Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_anova_script.Rmd @ 26:5b8e15b2a67c draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e0b80550743f634282b4b4348b75e6f172dc1488
| author | eschen42 |
|---|---|
| date | Wed, 26 Oct 2022 23:48:51 +0000 |
| parents | f9cd87ac8006 |
| children | 42b207aaa527 |
comparison
equal
deleted
inserted
replaced
| 25:f9cd87ac8006 | 26:5b8e15b2a67c |
|---|---|
| 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" |
| 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" |
| 7 date: | 7 date: |
| 8 - "May 28, 2018" | 8 - "May 28, 2018" |
| 9 - "; revised June 23, 2022" | 9 - "; revised June 23, 2022" |
| 10 lot: true | |
| 10 output: | 11 output: |
| 11 pdf_document: | 12 pdf_document: |
| 12 toc: true | 13 toc: true |
| 13 toc_depth: 3 | 14 toc_depth: 2 |
| 14 keep_tex: true | 15 keep_tex: true |
| 15 header-includes: | 16 dev: pdf |
| 16 - \usepackage{longtable} | 17 includes: |
| 17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | 18 in_header: mqppep_anova_preamble.tex |
| 18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | 19 latex_macros: false |
| 20 raw_tex: true | |
| 21 urlcolor: blue | |
| 19 params: | 22 params: |
| 20 alphaFile: "test-data/alpha_levels.tabular" | 23 alphaFile: "test-data/alpha_levels.tabular" |
| 21 inputFile: "test-data/test_input_for_anova.tabular" | 24 inputFile: "test-data/test_input_for_anova.tabular" |
| 22 preprocDb: "test-data/test_input_for_anova.sqlite" | 25 preprocDb: "test-data/test_input_for_anova.sqlite" |
| 23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 26 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
| 24 regexSampleNames: "\\.\\d+[A-Z]$" | 27 regexSampleNames: "\\.\\d+[A-Z]$" |
| 25 regexSampleGrouping: "\\d+" | 28 regexSampleGrouping: "\\d+" |
| 26 show_toc: true | 29 groupFilterPatterns: ".+" |
| 30 groupFilter: !r c("none", "exclude", "include")[1] | |
| 31 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 32 kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
| 33 #imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
| 34 | |
| 35 # how should sample groups be interpreted? | |
| 36 # - "f": fixed patterns (like `grep -F`) | |
| 37 # - "p": PERL-compatible (like `grep -P`) | |
| 38 # - "r": extended grep patterns (like `grep -E`) | |
| 39 # use what case sensitivity? | |
| 40 # - "i": case insensitive matching (like `grep -i`) | |
| 41 groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1] | |
| 42 # what pattern should be used for the first column | |
| 43 # (extended grep pattern, case sensitive) | |
| 27 firstDataColumn: "^Intensity[^_]" | 44 firstDataColumn: "^Intensity[^_]" |
| 28 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | 45 # for small random value imputation, what percentile should be center? |
| 29 meanPercentile: 1 | 46 meanPercentile: 50 |
| 47 #meanPercentile: 1 | |
| 48 # for small random value imputation, what should `s / mean(x)` ratio be? | |
| 30 sdPercentile: 1.0 | 49 sdPercentile: 1.0 |
| 50 # output path for imputed data file | |
| 31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | 51 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" |
| 52 # output path for imputed/quantile-normalized/log-transformed data file | |
| 32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | 53 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" |
| 54 # output path for contents of `stats_metadata_v` table | |
| 33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | 55 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" |
| 56 # how to test one variable with > 2 categories (e.g., aov or kruskal.test) | |
| 34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | 57 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] |
| 58 # how to test one variable with 2 categories (e.g., oneway.test) | |
| 35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | 59 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] |
| 36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 60 # what should be the minimum quality for consideration in both |
| 37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | 61 minQuality: 0 |
| 38 kseaMinKinaseCount: 1 | 62 # correct KSEA with FDR (recommended) or raw p-value |
| 39 intensityHeatmapRows: 75 | 63 kseaCutoffStatistic: !r c("FDR", "p.value")[1] |
| 64 # correct KSEA threshold 0.05 (conventional) or higher (perhaps better) | |
| 65 # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test | |
| 66 #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1] | |
| 67 # minimum number of substrates required for a kinase to be considered in KSEA | |
| 68 kseaMinSubstrateCount: 1 | |
| 69 # Should KSEA be performed aggregating signed log2FC or absolute? | |
| 70 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores | |
| 71 # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a | |
| 72 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. | |
| 73 kseaUseAbsoluteLog2FC: TRUE | |
| 74 #kseaUseAbsoluteLog2FC: FALSE | |
| 75 # minimum number of observed values per sample-group | |
| 76 intensityMinValuesPerGroup: 1 | |
| 77 # maximum number of heatmap rows (result are poor when > 50) | |
| 78 intensityHeatmapRows: 50 | |
| 79 # what should be the primary criterion to eliminate excessive heatmap rows | |
| 80 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] | |
| 81 # should correlation among substrates be used (rather than covariance) | |
| 82 correlateSubstrates: TRUE | |
| 83 # only show covariance among variables having variance > 1 | |
| 84 filterCovVarGT1: TRUE | |
| 85 # maximum number of residues to display for ppeps in rownames or columnames | |
| 86 ppepTruncN: 10 | |
| 87 # maximum number of characters of subgenes to display in rownames or columnames | |
| 88 subgeneTruncN: 10 | |
| 89 # maximum number of characters for paste(subgene, ppep) for enrichment plots | |
| 90 substTruncN: 20 | |
| 91 # should boxplots use variable-width boxes to reflect # of samples | |
| 92 boxPlotVarWidth: TRUE | |
| 93 # should boxplots use notched boxes to reflect difference between samples | |
| 94 boxPlotNotch: TRUE | |
| 95 # look-up tables for kinase descriptions | |
| 96 kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" | |
| 97 kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" | |
| 98 # should debugging trace messages be printed? | |
| 99 showEnrichedSubstrates: FALSE | |
| 100 | |
| 101 # should debugging nb/nbe messages be printed? | |
| 102 printNBMsgs: FALSE | |
| 103 # should debugging trace messages be printed? | |
| 104 printTraceMsgs: FALSE | |
| 105 # when debugging files are needed, set debugFileBasePath to the path | |
| 106 # to the directory where they should be writtn | |
| 107 debugFileBasePath: !r if (TRUE) NULL else "test-data" | |
| 40 --- | 108 --- |
| 41 <!-- | 109 <!-- |
| 110 alphaFile: "test-data/alpha_levels.tabular" | |
| 111 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
| 112 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
| 113 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 114 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
| 115 regexSampleGrouping: "\\w+" | |
| 116 groupFilterPatterns: ".+" | |
| 117 groupFilter: !r c("none", "exclude", "include")[3] | |
| 118 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
| 119 kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[1] | |
| 120 ut_alphaFile: "test-data/alpha_levels.tabular" | |
| 121 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
| 122 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
| 123 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
| 124 ut_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 125 ut_regexSampleGrouping: "\\d+" | |
| 126 ut_groupFilterPatterns: ".+,.*" | |
| 127 ut_groupFilter: !r c("none", "exclude", "include")[1] | |
| 128 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 129 ut_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1] | |
| 130 tst_alphaFile: "test-data/alpha_levels.tabular" | |
| 131 tst_inputFile: "test-data/test_input_for_anova.tabular" | |
| 132 tst_preprocDb: "test-data/test_input_for_anova.sqlite" | |
| 133 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 134 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 135 tst_regexSampleGrouping: "\\d+" | |
| 136 tst_groupFilterPatterns: ".+" | |
| 137 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
| 138 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 139 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
| 140 | |
| 141 tst_alphaFile: "test-data/alpha_levels.tabular" | |
| 142 tst_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
| 143 tst_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
| 144 tst_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
| 145 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 146 tst_regexSampleGrouping: "\\d+" | |
| 147 tst_groupFilterPatterns: ".+,.*" | |
| 148 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
| 149 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 150 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5] | |
| 151 px_alphaFile: "test-data/alpha_levels.tabular" | |
| 152 px_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
| 153 px_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
| 154 px_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 155 px_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
| 156 px_regexSampleGrouping: "\\w+" | |
| 157 px_groupFilterPatterns: ".+" | |
| 158 px_groupFilter: !r c("none", "exclude", "include")[3] | |
| 159 px_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 160 px_kseaCutoffThreshold: !r c(0.05, 0.1, 0.20, 0.35, 0.4, 0.5, 0.999)[5] | |
| 161 pdx_alphaFile: "test-data/alpha_levels.tabular" | |
| 162 pdx_inputFile: "test-data/PDX012970_pST.preproc_tab.tabular" | |
| 163 pdx_preprocDb: "test-data/PDX012970_pST.preproc.sqlite" | |
| 164 pdx_kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2] | |
| 165 pdx_regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$" | |
| 166 pdx_regexSampleGrouping: "\\.\\w+\\K\\.\\w+" | |
| 167 pdx_groupFilterPatterns: "AdCa,AVPC" | |
| 168 pdx_groupFilter: !r c("none", "exclude", "include")[3] | |
| 169 pdx_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 170 pdx_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[1] | |
| 171 tst_alphaFile: "test-data/alpha_levels.tabular" | |
| 172 tst_inputFile: "test-data/test_input_for_anova.tabular" | |
| 173 tst_preprocDb: "test-data/test_input_for_anova.sqlite" | |
| 174 tst_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 175 tst_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 176 tst_regexSampleGrouping: "\\d+" | |
| 177 tst_groupFilterPatterns: ".+" | |
| 178 tst_groupFilter: !r c("none", "exclude", "include")[1] | |
| 179 tst_kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] | |
| 180 tst_imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
| 181 ut_alphaFile: "test-data/alpha_levels.tabular" | |
| 182 ut_inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
| 183 ut_preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
| 184 ut_kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
| 185 ut_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 186 ut_regexSampleGrouping: "\\d+" | |
| 187 ut_groupFilterPatterns: ".+,.*" | |
| 188 ut_groupFilter: !r c("none", "exclude", "include")[1] | |
| 189 ut_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 42 alphaFile: "test-data/alpha_levels.tabular" | 190 alphaFile: "test-data/alpha_levels.tabular" |
| 43 inputFile: "test-data/test_input_for_anova.tabular" | 191 inputFile: "test-data/test_input_for_anova.tabular" |
| 44 preprocDb: "test-data/test_input_for_anova.sqlite" | 192 preprocDb: "test-data/test_input_for_anova.sqlite" |
| 45 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 193 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
| 46 regexSampleNames: "\\.\\d+[A-Z]$" | 194 regexSampleNames: "\\.\\d+[A-Z]$" |
| 47 regexSampleGrouping: "\\d+" | 195 regexSampleGrouping: "\\d+" |
| 196 groupFilterPatterns: ".+,.*" | |
| 197 groupFilter: !r c("none", "exclude", "include")[1] | |
| 198 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 199 nd_alphaFile: "test-data/alpha_levels.tabular" | |
| 200 nd_inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | |
| 201 nd_preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | |
| 202 nd_kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
| 203 nd_regexSampleNames: "\\.\\d+[A-Z]$" | |
| 204 nd_regexSampleGrouping: "\\d+" | |
| 205 nd_groupFilterPatterns: ".+,.*" | |
| 206 nd_groupFilter: !r c("none", "exclude", "include")[1] | |
| 207 nd_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 208 pxd_alphaFile: "test-data/alpha_levels.tabular" | |
| 209 pxd_inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | |
| 210 pxd_preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | |
| 211 pxd_kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 212 pxd_regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | |
| 213 pxd_regexSampleGrouping: "\\w+" | |
| 214 pxd_groupFilterPatterns: ".+" | |
| 215 pxd_groupFilter: !r c("none", "exclude", "include")[3] | |
| 216 pxd_imputationMethod: !r c("group-median", "median", "mean", "random")[4] | |
| 217 | |
| 218 alphaFile: "test-data/alpha_levels.tabular" | |
| 219 inputFile: "test-data/test_input_for_anova.tabular" | |
| 220 preprocDb: "test-data/test_input_for_anova.sqlite" | |
| 221 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 222 regexSampleNames: "\\.\\d+[A-Z]$" | |
| 223 regexSampleGrouping: "\\d+" | |
| 224 groupFilterPatterns: ".+,.*" | |
| 225 groupFilter: !r c("none", "exclude", "include")[1] | |
| 226 | |
| 227 alphaFile: "test-data/alpha_levels.tabular" | |
| 228 inputFile: "test-data/PDX012970_pST.preproc_tab.tabular" | |
| 229 preprocDb: "test-data/PDX012970_pST.preproc.sqlite" | |
| 230 kseaAppPrepDb: !r c(":memory:", "test-data/PDX012970.sqlite")[2] | |
| 231 regexSampleNames: "\\.\\w+\\.\\w+\\.\\d+[A-Z]$" | |
| 232 regexSampleGrouping: "\\.\\w+\\K\\.\\w+" | |
| 233 groupFilterPatterns: "AdCa,AVPC" | |
| 234 groupFilter: !r c("none", "exclude", "include")[3] | |
| 48 | 235 |
| 49 alphaFile: "test-data/alpha_levels.tabular" | 236 alphaFile: "test-data/alpha_levels.tabular" |
| 50 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" | 237 inputFile: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_tab.tabular" |
| 51 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" | 238 preprocDb: "test-data/PDX_pST_by_trt.ppep_intensities.ppep_map.preproc_sqlite.sqlite" |
| 52 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | 239 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
| 53 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" | 240 regexSampleNames: "\\.\\w+\\.\\d+[A-Z]$" |
| 54 regexSampleGrouping: "\\w+" | 241 regexSampleGrouping: "\\w+" |
| 242 groupFilterPatterns: ".+,.*" | |
| 243 groupFilter: !r c("none", "exclude", "include")[3] | |
| 55 | 244 |
| 56 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | 245 kseaCutoffStatistic: !r c("p.value", "FDR")[2] |
| 57 kseaCutoffThreshold: !r c(0.05, 0.1)[1] | 246 kseaCutoffThreshold: !r c(0.05, 0.1)[1] |
| 58 | 247 |
| 59 alphaFile: "test-data/alpha_levels.tabular" | 248 alphaFile: "test-data/alpha_levels.tabular" |
| 67 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | 256 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" |
| 68 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | 257 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" |
| 69 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] | 258 kseaAppPrepDb: !r c(":memory:", "test-data/pY_Sites_NancyDu.ksea.sqlite")[2] |
| 70 regexSampleNames: "\\.\\d+[A-Z]$" | 259 regexSampleNames: "\\.\\d+[A-Z]$" |
| 71 regexSampleGrouping: "\\d+" | 260 regexSampleGrouping: "\\d+" |
| 261 groupFilterPatterns: ".+,.*" | |
| 262 groupFilter: !r c("none", "exclude", "include")[3] | |
| 72 | 263 |
| 73 alphaFile: "test-data/alpha_levels.tabular" | 264 alphaFile: "test-data/alpha_levels.tabular" |
| 74 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | 265 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" |
| 75 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | 266 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" |
| 76 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | 267 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] |
| 77 regexSampleNames: "\\.\\d+[A-Z]$" | 268 regexSampleNames: "\\.\\d+[A-Z]$" |
| 78 regexSampleGrouping: "\\d+" | 269 regexSampleGrouping: "\\d+" |
| 79 | 270 groupFilterPatterns: ".+,.*" |
| 80 inputFile: "test-data/density_failure.preproc_tab.tabular" | 271 groupFilter: !r c("none", "exclude", "include")[1] |
| 81 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | 272 |
| 82 latex_document: default | |
| 83 --> | 273 --> |
| 84 ```{r setup, include = FALSE} | 274 ```{r setup, include = FALSE, results = 'asis'} |
| 275 | |
| 276 # simple debug messaging | |
| 277 print_nb_messages <- params$printNBMsgs | |
| 278 | |
| 279 nb <- if (!print_nb_messages) { | |
| 280 function(...) invisible() | |
| 281 } else { | |
| 282 function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) | |
| 283 } | |
| 284 | |
| 285 nbe <- if (!print_nb_messages) { | |
| 286 function(...) invisible() | |
| 287 } else { | |
| 288 function(..., f = cat, file = stderr()) { | |
| 289 cat( | |
| 290 stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"), | |
| 291 ..., | |
| 292 file = file | |
| 293 ) | |
| 294 } | |
| 295 } | |
| 296 | |
| 85 #ref for debugging: https://yihui.org/tinytex/r/#debugging | 297 #ref for debugging: https://yihui.org/tinytex/r/#debugging |
| 86 options(tinytex.verbose = TRUE) | 298 options(tinytex.verbose = TRUE) |
| 87 | 299 |
| 88 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 300 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
| 89 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | 301 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 |
| 90 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | 302 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) |
| 91 | 303 |
| 92 # freeze the random number generator so the same results will be produced | 304 # freeze the random number generator so the same results will be produced |
| 93 # from run to run | 305 # from run to run |
| 94 set.seed(28571) | 306 set.seed(28571) |
| 95 | 307 |
| 96 ### LIBRARIES | 308 ### LIBRARIES |
| 309 | |
| 310 if (print_nb_messages) nbe("library(gplots)") | |
| 97 library(gplots) | 311 library(gplots) |
| 312 if (print_nb_messages) nbe("library(caret)") | |
| 313 # load caret for nearZeroVar | |
| 314 if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n") | |
| 315 library(caret) | |
| 316 if (print_nb_messages) nbe("library(DBI)") | |
| 98 library(DBI) | 317 library(DBI) |
| 318 if (print_nb_messages) nbe("library(RSQLite)") | |
| 99 library(RSQLite) | 319 library(RSQLite) |
| 320 if (print_nb_messages) nbe("library(sqldf)\n") | |
| 100 # Suppress "Warning: no DISPLAY variable so Tk is not available" | 321 # Suppress "Warning: no DISPLAY variable so Tk is not available" |
| 101 suppressWarnings(suppressMessages(library(sqldf))) | 322 suppressWarnings(suppressMessages(library(sqldf))) |
| 102 | 323 |
| 103 # required but not added to search list: | 324 # required but not added to search list: |
| 104 # - DBI | 325 # - DBI |
| 110 # - reshape2 | 331 # - reshape2 |
| 111 # - vioplot | 332 # - vioplot |
| 112 | 333 |
| 113 ### CONSTANTS | 334 ### CONSTANTS |
| 114 | 335 |
| 115 const_parfin <- par("fin") | 336 const_boxplot_fill <- "grey94" |
| 116 const_boxplot_fill <- "grey94" | |
| 117 const_stripchart_cex <- 0.5 | |
| 118 const_stripsmall_cex <- | |
| 119 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
| 120 const_stripchart_jitter <- 0.3 | |
| 121 const_write_debug_files <- FALSE | |
| 122 const_table_anchor_bp <- "bp" | |
| 123 const_table_anchor_ht <- "ht" | |
| 124 const_table_anchor_p <- "p" | |
| 125 const_table_anchor_tbp <- "tbp" | |
| 126 | |
| 127 | |
| 128 const_ksea_astrsk_kinases <- 1 | 337 const_ksea_astrsk_kinases <- 1 |
| 129 const_ksea_nonastrsk_kinases <- 2 | 338 const_ksea_nonastrsk_kinases <- 2 |
| 130 const_ksea_all_kinases <- 3 | 339 const_ksea_all_kinases <- 3 |
| 131 | 340 const_log10_e <- log10(exp(1)) |
| 132 const_log10_e <- log10(exp(1)) | 341 const_stripchart_cex <- 0.5 |
| 133 | 342 const_stripchart_jitter <- 0.3 |
| 134 ### FUNCTIONS | 343 const_table_anchor_bp <- "bp" |
| 135 | 344 const_table_anchor_ht <- "ht" |
| 136 # from `demo(error.catching)` | 345 const_table_anchor_p <- "p" |
| 346 const_table_anchor_t <- "t" | |
| 347 const_table_anchor_tbp <- "tbp" | |
| 348 | |
| 349 | |
| 350 ### GLOBAL VARIABLES (params) | |
| 351 | |
| 352 ## functions to process params | |
| 353 | |
| 354 is_string_null_or_empty <- function(x) { | |
| 355 # N. B. non-strings are intentionally treated as NULL | |
| 356 if (is.null(x)) | |
| 357 TRUE | |
| 358 else if (!is.character(x)) | |
| 359 TRUE | |
| 360 else x == "" | |
| 361 } | |
| 362 | |
| 137 ##' Catch *and* save both errors and warnings, and in the case of | 363 ##' Catch *and* save both errors and warnings, and in the case of |
| 138 ##' a warning, also keep the computed result. | 364 ##' a warning, also keep the computed result. |
| 365 ##' return result as list(value = ..., warning = ...) | |
| 366 ##' - value will be: | |
| 367 ##' - the result if no exception is thrown | |
| 368 ##' - the exception if an exception is caught | |
| 369 ##' - warning will be a string except perhaps when warning argument is not NULL | |
| 370 ##' | |
| 371 ##' adapted from `demo(error.catching)` | |
| 139 ##' | 372 ##' |
| 140 ##' @title tryCatch both warnings (with value) and errors | 373 ##' @title tryCatch both warnings (with value) and errors |
| 141 ##' @param expr an \R expression to evaluate | 374 ##' @param expr an \R expression to evaluate |
| 142 ##' @return a list with 'value' and 'warning', where | 375 ##' @return a list with 'value' and 'warning', where |
| 143 ##' 'value' may be an error caught. | 376 ##' 'value' may be an error caught. |
| 144 ##' @author Martin Maechler; | 377 ##' @author Martin Maechler; |
| 145 ##' Copyright (C) 2010-2012 The R Core Team | 378 ##' Copyright (C) 2010-2012 The R Core Team |
| 146 try_catch_w_e <- function(expr) { | 379 try_catch_w_e <- |
| 147 wrn <- NULL | 380 function(expr, error = function(e) e, warning = NULL) { |
| 148 # warning handler | 381 wrn <- NULL |
| 149 w_handler <- function(w) { | 382 # warning handler |
| 150 wrn <<- w | 383 w_handler <- |
| 151 invokeRestart("muffleWarning") | 384 if (is.function(warning)) |
| 152 } | 385 warning |
| 153 list( | 386 else |
| 154 value = withCallingHandlers( | 387 function(w) { |
| 155 tryCatch( | 388 wrn <<- w |
| 156 expr, | 389 invokeRestart("muffleWarning") |
| 157 error = function(e) e | 390 } |
| 391 e_handler <- | |
| 392 if (is.function(error)) | |
| 393 error | |
| 394 else | |
| 395 function(e) e | |
| 396 # return result as list(value = ..., warning = ...) | |
| 397 # - value will be: | |
| 398 # - the result if no exception is thrown | |
| 399 # - the exception if an exception is caught | |
| 400 list( | |
| 401 value = withCallingHandlers( | |
| 402 tryCatch( | |
| 403 expr, | |
| 404 error = e_handler | |
| 405 ), | |
| 406 warning = w_handler | |
| 158 ), | 407 ), |
| 159 warning = w_handler | 408 warning = wrn |
| 160 ), | 409 ) |
| 161 warning = wrn | 410 } |
| 162 ) | 411 |
| 163 } | 412 see_kvp <- |
| 164 | 413 function(format, key, value, suffix = "") { |
| 165 | 414 if ( |
| 166 write_debug_file <- function(s) { | 415 !all( |
| 167 if (const_write_debug_files) { | 416 is.character(format), |
| 168 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | 417 is.character(key), |
| 169 print(sprintf("DEBUG writing file %s", spath)) | 418 is.character(value), |
| 419 is.character(suffix) | |
| 420 ) | |
| 421 ) { | |
| 422 cat("all arguments to see_kvp should be character") | |
| 423 knitr::knit_exit() | |
| 424 } | |
| 425 result <- sprintf(format, value) | |
| 426 if (length(result) > 1) { | |
| 427 sprintf( | |
| 428 "%s = c(%s)%s", | |
| 429 whack_underscores(key), | |
| 430 paste(result, collapse = ", "), | |
| 431 suffix | |
| 432 ) | |
| 433 } else { | |
| 434 sprintf( | |
| 435 "%s = %s%s", | |
| 436 key, | |
| 437 result, | |
| 438 suffix | |
| 439 ) | |
| 440 } | |
| 441 } | |
| 442 | |
| 443 see_logical <- | |
| 444 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
| 445 result <- as.character(as.logical(x)) | |
| 446 # handle NAs and NaNs | |
| 447 result[is.na(result)] <- "NA" | |
| 448 see_kvp( | |
| 449 format = "%s", | |
| 450 key = xprssn, | |
| 451 value = result, | |
| 452 suffix = suffix | |
| 453 ) | |
| 454 } | |
| 455 | |
| 456 see_numeric <- | |
| 457 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
| 458 if (is.numeric(digits) && is.numeric(x)) { | |
| 459 digits <- as.integer(digits) | |
| 460 digits <- min(16, max(0, digits)) | |
| 461 format <- paste0("%0.", as.character(digits), "g") | |
| 462 result <- sprintf(format, x) | |
| 463 see_kvp( | |
| 464 format = "%s", | |
| 465 key = xprssn, | |
| 466 value = result, | |
| 467 suffix = suffix | |
| 468 ) | |
| 469 } | |
| 470 } | |
| 471 | |
| 472 see_character <- | |
| 473 function(x, suffix = "", xprssn = deparse1(substitute(x))) { | |
| 474 if (is.character(x)) { | |
| 475 see_kvp( | |
| 476 format = "%s", | |
| 477 key = xprssn, | |
| 478 value = sprintf("\"%s\"", x), | |
| 479 suffix = suffix | |
| 480 ) | |
| 481 } | |
| 482 } | |
| 483 | |
| 484 see_variable <- | |
| 485 function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { | |
| 486 if (is.character(x)) { | |
| 487 see_character(x, suffix, xprssn) | |
| 488 } else if (is.numeric(x)) { | |
| 489 see_numeric(x, suffix, digits, xprssn) | |
| 490 } else if (is.logical(x)) { | |
| 491 see_logical(x, suffix, xprssn) | |
| 492 } else { | |
| 493 f <- file("") | |
| 494 sink(f) | |
| 495 str(x) | |
| 496 msg <- paste(readLines(f), collapse = "\n") | |
| 497 sink() | |
| 498 close(f) | |
| 499 paste0( | |
| 500 "see_variable - str(", | |
| 501 xprssn, | |
| 502 "):\n", | |
| 503 msg, "\n" | |
| 504 ) | |
| 505 } | |
| 506 } | |
| 507 | |
| 508 # ref: https://tug.org/texinfohtml/latex2e.html | |
| 509 # LaTeX sets aside the following characters for special purposes. | |
| 510 # For example, the percent sign % is for comments. | |
| 511 # They are called reserved characters or special characters. | |
| 512 # They are all discussed elsewhere in this manual. | |
| 513 # | |
| 514 # $ % & { } _ ~ ^ \ # | |
| 515 # | |
| 516 # If you want a reserved character to be printed as itself, in the text body | |
| 517 # font, for all but the final three characters in that list simply put | |
| 518 # a backslash \ in front of the character. | |
| 519 # Thus, typing \$1.23 will produce $1.23 in your output. | |
| 520 # | |
| 521 # As to the last three characters, to get a tilde in the text body font, | |
| 522 # use \~{} (omitting the curly braces would result in the next character | |
| 523 # receiving a tilde accent). | |
| 524 # Similarly, to get a text body font circumflex use \^{}. | |
| 525 # To get a backslash in the font of the text body enter \textbackslash{}. | |
| 526 whack_math <- | |
| 527 function(v) { | |
| 528 v <- as.character(v) | |
| 529 w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) | |
| 530 w <- Reduce( | |
| 531 f = function(l, r) { | |
| 532 gsub(r, paste0("\\", r), l, fixed = TRUE) | |
| 533 }, | |
| 534 x = c("#", "$", "%", "&", "{", "}", "_"), | |
| 535 init = w | |
| 536 ) | |
| 537 w <- gsub("^", "\\^{}", w, fixed = TRUE) | |
| 538 return(w) | |
| 539 if (all(v == w)) | |
| 540 v | |
| 541 else | |
| 542 paste0("\\texttt{", w, "}") | |
| 543 } | |
| 544 whack_underscores <- whack_math | |
| 545 | |
| 546 ## dump params to stderr (remove this eventually) | |
| 547 | |
| 548 if (FALSE) nbe(see_variable(params)) | |
| 549 | |
| 550 ## unlist params for eventual output | |
| 551 | |
| 552 param_unlist <- unlist(as.list(params)) | |
| 553 | |
| 554 # no need to whack underscores and dollars because this is verbatim | |
| 555 param_df <- data.frame( | |
| 556 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
| 557 value = paste0( | |
| 558 "\n\\begin{tiny}\n\\verb@", | |
| 559 param_unlist, | |
| 560 "@\n\\end{tiny}" | |
| 561 ) | |
| 562 ) | |
| 563 param_df <- data.frame( | |
| 564 parameter = names(param_unlist), | |
| 565 value = param_unlist | |
| 566 ) | |
| 567 param_df <- param_df[order(param_df$parameter), ] | |
| 568 | |
| 569 ## general output control | |
| 570 | |
| 571 debug_file_base_path <- params$debugFileBasePath | |
| 572 print_trace_messages <- params$printTraceMsgs | |
| 573 show_enriched_substrates <- params$showEnrichedSubstrates | |
| 574 boxplot_varwidth <- params$boxPlotVarWidth | |
| 575 boxplot_notch <- params$boxPlotNotch | |
| 576 | |
| 577 ## parameters for static data | |
| 578 | |
| 579 kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 | |
| 580 kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 | |
| 581 | |
| 582 ## parameters for input file | |
| 583 | |
| 584 preproc_db <- params$preprocDb | |
| 585 alpha_file <- params$alphaFile | |
| 586 input_file <- params$inputFile | |
| 587 | |
| 588 # First data column - ideally, this could be detected via | |
| 589 # regexSampleNames, but for now leave it as is. | |
| 590 first_data_column <- params$firstDataColumn | |
| 591 fdc_is_integer <- is.integer(first_data_column) | |
| 592 if (fdc_is_integer) { | |
| 593 first_data_column <- as.integer(params$firstDataColumn) | |
| 594 } | |
| 595 | |
| 596 ## parameters for output files | |
| 597 | |
| 598 ksea_app_prep_db <- params$kseaAppPrepDb | |
| 599 imputed_data_filename <- params$imputedDataFilename | |
| 600 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
| 601 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
| 602 | |
| 603 ## parameters for imputation | |
| 604 | |
| 605 # Imputation method, should be one of | |
| 606 # "random", "group-median", "median", or "mean" | |
| 607 imputation_method <- params$imputationMethod | |
| 608 | |
| 609 # Selection of percentile of logvalue data to set the mean for random number | |
| 610 # generation when using random imputation | |
| 611 mean_percentile <- params$meanPercentile / 100.0 | |
| 612 | |
| 613 # deviation adjustment-factor for random values; real number. | |
| 614 sd_percentile <- params$sdPercentile | |
| 615 | |
| 616 ## parameters for group parsing and filtering | |
| 617 | |
| 618 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
| 619 regex_sample_names <- params$regexSampleNames | |
| 620 # Regular expression to extract Sample Grouping from Sample Name; | |
| 621 # if error occurs, compare smpl_trt vs. sample_name_matches | |
| 622 # to see if groupings/pairs line up | |
| 623 # e.g., "(\\d+)" | |
| 624 | |
| 625 regex_sample_grouping <- params$regexSampleGrouping | |
| 626 # What are the patterns for filtering sample groups? | |
| 627 # How should sample groups be filtered? | |
| 628 # - none: do not filter | |
| 629 # - include: include sample groups matching filter | |
| 630 # - exclude: include sample groups not matching filter | |
| 631 | |
| 632 sample_group_filter <- params$groupFilter | |
| 633 if (grepl("f", params$groupFilterMode, fixed = TRUE)) { | |
| 634 sample_group_filter_perl <- FALSE | |
| 635 sample_group_filter_fixed <- TRUE | |
| 636 } else if (grepl("p", params$groupFilterMode, fixed = TRUE)) { | |
| 637 sample_group_filter_perl <- TRUE | |
| 638 sample_group_filter_fixed <- FALSE | |
| 639 } else { # normal regex | |
| 640 sample_group_filter_perl <- FALSE | |
| 641 sample_group_filter_fixed <- FALSE | |
| 642 } | |
| 643 | |
| 644 sample_group_filter_nocase <- | |
| 645 grepl("i", params$groupFilterMode, fixed = TRUE) | |
| 646 | |
| 647 # What PCRE patterns should be included or excluded | |
| 648 group_filter_patterns_csv <- params$groupFilterPatterns | |
| 649 sample_group_filter_patterns <- strsplit( | |
| 650 x = group_filter_patterns_csv, | |
| 651 split = ",", | |
| 652 fixed = TRUE | |
| 653 )[[1]] | |
| 654 | |
| 655 ## parameters for hypothesis testing | |
| 656 | |
| 657 one_way_all_categories_fname <- params$oneWayManyCategories | |
| 658 | |
| 659 one_way_all_categories <- try_catch_w_e( | |
| 660 match.fun(one_way_all_categories_fname)) | |
| 661 | |
| 662 if (!is.function(one_way_all_categories$value)) { | |
| 663 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
| 664 write(one_way_all_categories$value$message, stderr()) | |
| 665 if (sys.nframe() > 0) { | |
| 666 cat("Cannot continue and quit() failed. Goodbye.") | |
| 667 knitr::knit_exit() | |
| 668 quit(save = "no", status = 1) | |
| 669 } | |
| 670 } | |
| 671 | |
| 672 one_way_all_categories <- one_way_all_categories$value | |
| 673 | |
| 674 one_way_two_categories_fname <- params$oneWayManyCategories | |
| 675 one_way_two_categories <- try_catch_w_e( | |
| 676 match.fun(one_way_two_categories_fname)) | |
| 677 if (!is.function(one_way_two_categories$value)) { | |
| 678 cat("fatal error for parameter oneWayTwoCategories: \n") | |
| 679 cat(one_way_two_categories$value$message, fill = TRUE) | |
| 680 if (sys.nframe() > 0) { | |
| 681 cat("Cannot continue and quit() failed. Goodbye.") | |
| 682 knitr::knit_exit() | |
| 683 quit(save = "no", status = 1) | |
| 684 } | |
| 685 } | |
| 686 one_way_two_categories <- one_way_two_categories$value | |
| 687 | |
| 688 ## parameters for KSEA | |
| 689 | |
| 690 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
| 691 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
| 692 ksea_min_substrate_count <- params$kseaMinSubstrateCount | |
| 693 | |
| 694 ## parameters for global variables consumed by functions | |
| 695 | |
| 696 # intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 | |
| 697 # TODO Validate within list | |
| 698 g_intensity_hm_criteria <- params$intensityHeatmapCriteria | |
| 699 if (is_string_null_or_empty(g_intensity_hm_criteria)) { | |
| 700 cat("invalid intensityHeatmapCriteria parameter (must be string)") | |
| 701 knitr::knit_exit() | |
| 702 } | |
| 703 switch( | |
| 704 g_intensity_hm_criteria, | |
| 705 "quality" = NULL, | |
| 706 "na_count" = NULL, | |
| 707 "p_value" = NULL, | |
| 708 { | |
| 709 with( | |
| 710 params, | |
| 711 cat( | |
| 712 sprintf( | |
| 713 "invalid %s (must be %s)", | |
| 714 see_variable(intensityHeatmapCriteria), | |
| 715 "one of quality or na_count or p_value" | |
| 716 ) | |
| 717 ) | |
| 718 ) | |
| 719 knitr::knit_exit() | |
| 720 } | |
| 721 ) | |
| 722 | |
| 723 # intensityHeatmapRows: 50 | |
| 724 # TODO Validate >> 0 < 75 | |
| 725 g_intensity_hm_rows <- params$intensityHeatmapRows | |
| 726 if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { | |
| 727 cat("invalid intensityHeatmapRows (must be integer > 0)") | |
| 728 knitr::knit_exit() | |
| 729 } | |
| 730 | |
| 731 g_intensity_min_per_class <- params$intensityMinValuesPerGroup | |
| 732 if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { | |
| 733 cat("invalid intensityMinValuesPerGroup (must be integer > -1") | |
| 734 knitr::knit_exit() | |
| 735 } | |
| 736 | |
| 737 if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) { | |
| 738 cat("invalid correlateSubstrates (must be TRUE or FALSE)") | |
| 739 knitr::knit_exit() | |
| 740 } | |
| 741 | |
| 742 if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) { | |
| 743 cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") | |
| 744 knitr::knit_exit() | |
| 745 } | |
| 746 | |
| 747 # TODO Validate >> 0 < 30 | |
| 748 g_ppep_trunc_n <- params$ppepTruncN | |
| 749 | |
| 750 # TODO Validate >> 0 < 30 | |
| 751 g_subgene_trunc_n <- params$subgeneTruncN | |
| 752 | |
| 753 # TODO Validate >> 0 < 30 | |
| 754 g_sbstr_trunc_n <- params$substTruncN | |
| 755 | |
| 756 | |
| 757 ### OPERATORS | |
| 758 | |
| 759 # Test for exclusion | |
| 760 # ref: https://www.reneshbedre.com/blog/in-operator-r.html | |
| 761 `%notin%` <- Negate(`%in%`) | |
| 762 | |
| 763 # Augmented assignment | |
| 764 # ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign | |
| 765 `%||:=%` <- function(lvalue, ...) { | |
| 766 pf <- parent.frame() | |
| 767 rvalue <- Reduce(paste0, x = ..., init = lvalue) | |
| 768 assign( | |
| 769 x = as.character(substitute(lvalue)), | |
| 770 value = rvalue, | |
| 771 pos = pf | |
| 772 ) | |
| 773 invisible(rvalue) | |
| 774 } | |
| 775 | |
| 776 ### FUNCTIONS | |
| 777 | |
| 778 no_op <- | |
| 779 function() { | |
| 780 } | |
| 781 # this function is not used in this file and should be removed while | |
| 782 # factoring out reusable code | |
| 783 all_apply <- function(f, v, na_rm = TRUE, ...) { | |
| 784 Reduce( | |
| 785 f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, | |
| 786 x = sapply(X = v, FUN = f, ...), | |
| 787 init = TRUE | |
| 788 ) | |
| 789 } | |
| 790 | |
| 791 write_debug_file <- function(data_frame) { | |
| 792 if (!is.null(debug_file_base_path)) { | |
| 793 s_path <- | |
| 794 sprintf( | |
| 795 "%s/%s.txt", | |
| 796 debug_file_base_path, | |
| 797 deparse(substitute(data_frame)) | |
| 798 ) | |
| 170 write.table( | 799 write.table( |
| 171 s, | 800 data_frame, |
| 172 file = s_path, | 801 file = s_path, |
| 173 sep = "\t", | 802 sep = "\t", |
| 174 col.names = TRUE, | 803 col.names = TRUE, |
| 175 row.names = TRUE, | 804 row.names = TRUE, |
| 176 quote = FALSE | 805 quote = FALSE |
| 187 # Hence, `x <- 1; get("x", new_env())` fails by design. | 816 # Hence, `x <- 1; get("x", new_env())` fails by design. |
| 188 new_env <- function() { | 817 new_env <- function() { |
| 189 new.env(parent = emptyenv()) | 818 new.env(parent = emptyenv()) |
| 190 } | 819 } |
| 191 | 820 |
| 821 # make apply readable for rows | |
| 822 row_apply <- function(x, fun, ..., simplify = TRUE) { | |
| 823 apply(x, MARGIN = 1, fun, ..., simplify = TRUE) | |
| 824 } | |
| 825 | |
| 826 # make apply readable for columns | |
| 827 column_apply <- function(x, fun, ..., simplify = TRUE) { | |
| 828 apply(x, MARGIN = 2, fun, ..., simplify = TRUE) | |
| 829 } | |
| 830 | |
| 831 ##' Produce a vector of boolean values whose i-th value is TRUE when any | |
| 832 ##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) | |
| 833 ##' | |
| 834 ##' @title Search multiple strings for matches of multiple substrings | |
| 835 ##' @param v a vector of substrings to match | |
| 836 ##' @param s a vector of strings to search for matches | |
| 837 ##' @param ... additional arguments to grepl | |
| 838 ##' @return a list with keys in s and valuse that are vectors of elements of v | |
| 839 ##' @author Art Eschenlauer | |
| 840 ##' Copyright (C) 2022 Art Eschenlauer; | |
| 841 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
| 842 mgrepl <- function(v, s, ...) { | |
| 843 grpl_rslt <- rep_len(0, length(s)) | |
| 844 for (vi in v) { | |
| 845 grpl_rslt_v <- sapply( | |
| 846 X = s, | |
| 847 FUN = function(t) { | |
| 848 Reduce( | |
| 849 f = function(l, r) if (is.null(l)) r else c(l, r), | |
| 850 x = sapply( | |
| 851 X = vi, | |
| 852 FUN = function(f) grepl(f, t, ...) | |
| 853 ), | |
| 854 init = c() | |
| 855 ) | |
| 856 }, | |
| 857 simplify = "array" | |
| 858 ) | |
| 859 grpl_rslt <- grpl_rslt + grpl_rslt_v | |
| 860 } | |
| 861 rslt <- unname(grpl_rslt > 0) | |
| 862 } | |
| 863 | |
| 864 ##' Produce positions in a vector where succeeding value != current valus | |
| 865 ##' | |
| 866 ##' @title Search vector for neighboring positions having different values | |
| 867 ##' @param v a vector of comparable numeric values (e.g. integers) | |
| 868 ##' @return a vector of positions i where v[i] != v[i + 1] | |
| 869 ##' @author Art Eschenlauer | |
| 870 ##' Copyright (C) 2022 Art Eschenlauer; | |
| 871 ##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms | |
| 872 transition_positions <- function(v) { | |
| 873 Reduce( | |
| 874 f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, | |
| 875 x = seq_along(v)[-1:0], | |
| 876 init = c() | |
| 877 ) | |
| 878 } | |
| 879 | |
| 880 ### figure debug functions | |
| 881 | |
| 882 cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { | |
| 883 cat( | |
| 884 sprintf( | |
| 885 "%spar(%s) = c(%s)%s", | |
| 886 lbl, | |
| 887 par_name, | |
| 888 paste(par(par_name), collapse = ", "), | |
| 889 if (newlines) "\n\n" else "" | |
| 890 ) | |
| 891 ) | |
| 892 } | |
| 893 | |
| 894 cat_margins <- function(lbl = NULL) { | |
| 895 for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) | |
| 896 cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) | |
| 897 } | |
| 898 | |
| 899 cat_variable <- | |
| 900 function(x, suffix = "", digits = 3, force_str = FALSE) { | |
| 901 xprssn <- deparse1(substitute(x)) | |
| 902 if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { | |
| 903 cat( | |
| 904 paste0( | |
| 905 "\n\\texttt{\\textbf{", | |
| 906 whack_underscores(xprssn), | |
| 907 "}} [", | |
| 908 typeof(x), | |
| 909 ",", | |
| 910 mode(x), | |
| 911 "] =\n" | |
| 912 ) | |
| 913 ) | |
| 914 cat("\n\\begin{verbatim}\n") | |
| 915 str(x) | |
| 916 cat("\n\\end{verbatim}\n") | |
| 917 } else { | |
| 918 cat("\n", see_variable(x, suffix, digits, xprssn)) | |
| 919 } | |
| 920 } | |
| 921 | |
| 922 ### structure helper functions | |
| 923 | |
| 924 # ref: staque.R - Icon-oriented stack and queue operations | |
| 925 # - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 | |
| 926 # | |
| 927 # sq_get(v):x produces the leftmost element of v and removes it from v, | |
| 928 # but produces NA if v is empty | |
| 929 sq_get <- function(v) { | |
| 930 if (length(v) == 0) return(NA) | |
| 931 assign(as.character(substitute(v)), v[-1], parent.frame()) | |
| 932 return(v[1]) | |
| 933 } | |
| 934 # | |
| 935 # sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, | |
| 936 # producing v. | |
| 937 # Values are pushed in order from left to right, | |
| 938 # so xn becomes the last (rightmost) value on v. | |
| 939 # sq_put(v) with no second argument does nothing. | |
| 940 sq_put <- function(v, x = NA, ...) { | |
| 941 pf <- parent.frame() | |
| 942 if (is.null(x)) return(pf$v) | |
| 943 if ( | |
| 944 !(length(x) > 1) && | |
| 945 !rlang::is_closure(x) && | |
| 946 is.na(x) | |
| 947 ) return(pf$v) | |
| 948 assign(as.character(substitute(v)), c(v, x, ...), pf) | |
| 949 pf[[as.character(substitute(v))]] | |
| 950 } | |
| 951 | |
| 192 ### numerical/statistical helper functions | 952 ### numerical/statistical helper functions |
| 193 | 953 |
| 194 any_nan <- function(x) { | 954 any_nan <- function(x) { |
| 195 !any(x == "NaN") | 955 !any(x == "NaN") |
| 196 } | 956 } |
| 199 sd_finite <- function(x) { | 959 sd_finite <- function(x) { |
| 200 ok <- is.finite(x) | 960 ok <- is.finite(x) |
| 201 sd(x[ok]) | 961 sd(x[ok]) |
| 202 } | 962 } |
| 203 | 963 |
| 964 # compute anova raw p-value | |
| 204 anova_func <- function(x, grouping_factor, one_way_f) { | 965 anova_func <- function(x, grouping_factor, one_way_f) { |
| 205 subject <- data.frame( | 966 subject <- data.frame( |
| 206 intensity = x | 967 intensity = x |
| 207 ) | 968 ) |
| 208 x_aov <- | 969 x_aov <- |
| 216 else | 977 else |
| 217 pvalue <- x_aov$p.value | 978 pvalue <- x_aov$p.value |
| 218 pvalue | 979 pvalue |
| 219 } | 980 } |
| 220 | 981 |
| 982 # This code adapted from matrixcalc::is.positive.definite | |
| 983 # Notably, this simply tests without calling stop() | |
| 984 is_positive_definite <- function(x, tol = 1e-08) { | |
| 985 if (!is.matrix(x)) | |
| 986 return(FALSE) | |
| 987 if (!is.numeric(x)) | |
| 988 return(FALSE) | |
| 989 if (nrow(x) < 1) | |
| 990 return(FALSE) | |
| 991 if (ncol(x) < 1) | |
| 992 return(FALSE) | |
| 993 if (nrow(x) != ncol(x)) | |
| 994 return(FALSE) | |
| 995 sum_symm <- sum(x == t(x), na.rm = TRUE) | |
| 996 value_count <- Reduce("*", dim(x)) | |
| 997 if (sum_symm != value_count) | |
| 998 return(FALSE) | |
| 999 eigenvalues <- eigen(x, only.values = TRUE)$values | |
| 1000 n <- nrow(x) | |
| 1001 for (i in 1:n) { | |
| 1002 if (abs(eigenvalues[i]) < tol) { | |
| 1003 eigenvalues[i] <- 0 | |
| 1004 } | |
| 1005 } | |
| 1006 if (any(eigenvalues <= 0)) { | |
| 1007 return(FALSE) | |
| 1008 } | |
| 1009 return(TRUE) | |
| 1010 } | |
| 221 | 1011 |
| 222 ### LaTeX functions | 1012 ### LaTeX functions |
| 223 | 1013 |
| 224 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 225 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
| 226 cat( | |
| 227 paste0( | |
| 228 v_sub, | |
| 229 collapse = collapse_string | |
| 230 ) | |
| 231 ) | |
| 232 } | |
| 233 | |
| 234 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 235 cat("\\begin{itemize}\n\\item ") | |
| 236 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 237 cat("\n\\end{itemize}\n") | |
| 238 } | |
| 239 | |
| 240 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
| 241 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
| 242 } | |
| 243 | |
| 244 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 245 cat("\\begin{enumerate}\n\\item ") | |
| 246 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 247 cat("\n\\end{enumerate}\n") | |
| 248 } | |
| 249 | |
| 250 latex_enumerated_list <- function(v) { | |
| 251 latex_enumerated_collapsed("\n\\item ", v) | |
| 252 } | |
| 253 | |
| 254 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
| 255 latex_collapsed_vector(" & ", v, underscore_whack) | |
| 256 cat(extra) | |
| 257 cat(" \\\\\n") | |
| 258 } | |
| 259 | |
| 260 # Use this like print.data.frame, from which it is adapted: | 1014 # Use this like print.data.frame, from which it is adapted: |
| 261 data_frame_latex <- | 1015 data_frame_table_latex <- |
| 262 function( | 1016 function( |
| 263 x, | 1017 x, |
| 264 ..., | |
| 265 # digits to pass to format.data.frame | 1018 # digits to pass to format.data.frame |
| 266 digits = NULL, | 1019 digits = NULL, |
| 267 # TRUE -> right-justify columns; FALSE -> left-justify | 1020 # TRUE -> right-justify columns; FALSE -> left-justify |
| 268 right = TRUE, | 1021 right = TRUE, |
| 269 # maximumn number of rows to print | 1022 # maximumn number of rows to print |
| 275 # optional caption | 1028 # optional caption |
| 276 caption = NULL, | 1029 caption = NULL, |
| 277 # h(inline); b(bottom); t (top) or p (separate page) | 1030 # h(inline); b(bottom); t (top) or p (separate page) |
| 278 anchor = "h", | 1031 anchor = "h", |
| 279 # set underscore_whack to TRUE to escape underscores | 1032 # set underscore_whack to TRUE to escape underscores |
| 280 underscore_whack = TRUE | 1033 underscore_whack = TRUE, |
| 1034 # how to emit results | |
| 1035 emit = cat | |
| 281 ) { | 1036 ) { |
| 282 if (is.null(justification)) | 1037 if (is.null(justification)) |
| 283 justification <- | 1038 justification <- |
| 284 Reduce( | 1039 Reduce( |
| 285 f = paste, | 1040 f = paste, |
| 286 x = rep_len(if (right) "r" else "l", length(colnames(x))) | 1041 x = rep_len(if (right) "r" else "l", length(colnames(x))) |
| 287 ) | 1042 ) |
| 1043 n <- length(rownames(x)) | |
| 1044 if (length(x) == 0L) { | |
| 1045 emit( | |
| 1046 sprintf( | |
| 1047 # if n is one, use singular 'row', else use plural 'rows' | |
| 1048 ngettext( | |
| 1049 n, | |
| 1050 "data frame with 0 columns and %d row", | |
| 1051 "data frame with 0 columns and %d rows" | |
| 1052 ), | |
| 1053 n | |
| 1054 ), | |
| 1055 "\n", | |
| 1056 sep = "" | |
| 1057 ) | |
| 1058 } else if (n == 0L) { | |
| 1059 emit("0 rows for:\n") | |
| 1060 latex_itemized_list( | |
| 1061 v = names(x), | |
| 1062 underscore_whack = underscore_whack | |
| 1063 ) | |
| 1064 } else { | |
| 1065 if (is.null(max)) | |
| 1066 max <- getOption("max.print", 99999L) | |
| 1067 if (!is.finite(max)) { | |
| 1068 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) | |
| 1069 knitr::knit_exit() | |
| 1070 } | |
| 1071 omit <- (n0 <- max %/% length(x)) < n | |
| 1072 m <- as.matrix( | |
| 1073 format.data.frame( | |
| 1074 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
| 1075 digits = digits, | |
| 1076 na.encode = FALSE | |
| 1077 ) | |
| 1078 ) | |
| 1079 emit( | |
| 1080 # h(inline); b(bottom); t (top) or p (separate page) | |
| 1081 paste0("\\begin{table}[", anchor, "]"), | |
| 1082 "\\leavevmode", | |
| 1083 sep = "\n" | |
| 1084 ) | |
| 1085 if (!is.null(caption)) | |
| 1086 emit(paste0(" \\caption{", caption, "}")) | |
| 1087 if (centered) emit("\\centering\n") | |
| 1088 emit( | |
| 1089 paste( | |
| 1090 " \\begin{tabular}{", | |
| 1091 justification, | |
| 1092 "}\n", | |
| 1093 sep = "" | |
| 1094 ) | |
| 1095 ) | |
| 1096 | |
| 1097 # ref for top and bottom struts (\T and \B): | |
| 1098 # https://tex.stackexchange.com/a/50355 | |
| 1099 if (!is.null(caption)) | |
| 1100 emit("\\B \\\\\n") | |
| 1101 latex_table_row( | |
| 1102 v = colnames(m), | |
| 1103 extra = " \\T \\B", | |
| 1104 underscore_whack = underscore_whack | |
| 1105 ) | |
| 1106 emit("\\hline \\\\\n") | |
| 1107 for (i in seq_len(length(m[, 1]))) { | |
| 1108 latex_table_row( | |
| 1109 v = m[i, ], | |
| 1110 underscore_whack = underscore_whack | |
| 1111 ) | |
| 1112 } | |
| 1113 emit( | |
| 1114 paste( | |
| 1115 " \\end{tabular}", | |
| 1116 "\\end{table}", | |
| 1117 sep = "\n" | |
| 1118 ) | |
| 1119 ) | |
| 1120 if (omit) | |
| 1121 emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
| 1122 n - n0, "rows ]\n") | |
| 1123 } | |
| 1124 invisible(x) | |
| 1125 } | |
| 1126 | |
| 1127 # Use this like print.data.frame, from which it is adapted: | |
| 1128 data_frame_tabbing_latex <- | |
| 1129 function( | |
| 1130 x, | |
| 1131 # vector of tab stops, in inches | |
| 1132 tabstops, | |
| 1133 # vector of headings, registered with tab-stops | |
| 1134 headings = colnames(x), | |
| 1135 # digits to pass to format.data.frame | |
| 1136 digits = NULL, | |
| 1137 # maximumn number of rows to print | |
| 1138 max = NULL, | |
| 1139 # optional caption | |
| 1140 caption = NULL, | |
| 1141 # set underscore_whack to TRUE to escape underscores | |
| 1142 underscore_whack = TRUE, | |
| 1143 # flag for landscape mode | |
| 1144 landscape = FALSE, | |
| 1145 # flag indicating that subsubsection should be used for caption | |
| 1146 # rather than subsection | |
| 1147 use_subsubsection_header = TRUE, | |
| 1148 # character-size indicator; for possible values, see: | |
| 1149 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
| 1150 charactersize = "small", | |
| 1151 # set verbatim to TRUE to debug output | |
| 1152 verbatim = FALSE | |
| 1153 ) { | |
| 1154 | |
| 1155 hlinport <- if (landscape) { | |
| 1156 function() cat("\\hlinlscp \\\\\n") | |
| 1157 } else { | |
| 1158 function() cat("\\hlinport \\\\\n") | |
| 1159 } | |
| 1160 | |
| 1161 tabstops_tex <- | |
| 1162 Reduce( | |
| 1163 f = function(l, r) paste0(l, r), | |
| 1164 x = sprintf("\\hspace{%0.2fin}\\=", tabstops), | |
| 1165 init = "" | |
| 1166 ) | |
| 1167 | |
| 288 n <- length(rownames(x)) | 1168 n <- length(rownames(x)) |
| 289 if (length(x) == 0L) { | 1169 if (length(x) == 0L) { |
| 290 cat( | 1170 cat( |
| 291 sprintf( | 1171 sprintf( |
| 292 # if n is one, use singular 'row', else use plural 'rows' | 1172 # if n is one, use singular 'row', else use plural 'rows' |
| 307 underscore_whack = underscore_whack | 1187 underscore_whack = underscore_whack |
| 308 ) | 1188 ) |
| 309 } else { | 1189 } else { |
| 310 if (is.null(max)) | 1190 if (is.null(max)) |
| 311 max <- getOption("max.print", 99999L) | 1191 max <- getOption("max.print", 99999L) |
| 312 if (!is.finite(max)) | 1192 if (!is.finite(max)) { |
| 313 stop("invalid 'max' / getOption(\"max.print\"): ", | 1193 cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) |
| 314 max) | 1194 knitr::knit_exit() |
| 1195 } | |
| 315 omit <- (n0 <- max %/% length(x)) < n | 1196 omit <- (n0 <- max %/% length(x)) < n |
| 316 m <- as.matrix( | 1197 m <- as.matrix( |
| 317 format.data.frame( | 1198 format.data.frame( |
| 318 if (omit) x[seq_len(n0), , drop = FALSE] else x, | 1199 if (omit) x[seq_len(n0), , drop = FALSE] else x, |
| 319 digits = digits, | 1200 digits = digits, |
| 320 na.encode = FALSE | 1201 na.encode = FALSE |
| 321 ) | 1202 ) |
| 322 ) | 1203 ) |
| 323 cat( | 1204 if (landscape) |
| 324 # h(inline); b(bottom); t (top) or p (separate page) | 1205 cat("\n\\begin{landscape}") |
| 325 paste0("\\begin{table}[", anchor, "]\n") | 1206 tex_caption <- |
| 326 ) | 1207 if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) |
| 327 if (!is.null(caption)) | 1208 else "\n" |
| 328 cat(paste0(" \\caption{", caption, "}")) | 1209 # build the column names, which have multiple lines when |
| 329 if (centered) cat("\\centering\n") | 1210 # length(headings) is a multiple of the number of columns |
| 330 cat( | 1211 column_names <- "" |
| 331 paste( | 1212 while (length(headings) > 0) { |
| 332 " \\begin{tabular}{", | 1213 my_row <- c() |
| 333 justification, | 1214 for (i in 1:(1 + length(tabstops))) { |
| 334 "}\n", | 1215 my_field <- sq_get(headings) |
| 335 sep = "" | 1216 sq_put(my_row, if (is.na(my_field)) "" else my_field) |
| 336 ) | 1217 } |
| 337 ) | 1218 column_names %||:=% latex_tabbing_row( |
| 338 # ref: https://tex.stackexchange.com/a/50353 | 1219 v = my_row, |
| 339 # Describes use of \rule{0pt}{3ex} | 1220 underscore_whack = underscore_whack, |
| 340 if (!is.null(caption)) | 1221 action = paste0 |
| 341 cat("\\B \\\\ \\hline\\hline\n") | |
| 342 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
| 343 latex_table_row( | |
| 344 v = colnames(m), | |
| 345 extra = "\\T\\B", | |
| 346 underscore_whack = underscore_whack | |
| 347 ) | |
| 348 cat("\\hline\n") | |
| 349 for (i in seq_len(length(m[, 1]))) { | |
| 350 latex_table_row( | |
| 351 v = m[i, ], | |
| 352 underscore_whack = underscore_whack | |
| 353 ) | 1222 ) |
| 354 } | 1223 } |
| 1224 | |
| 1225 # Begin tabbing environment after beginning charactersize environment | |
| 1226 if (verbatim) cat("\n\\begin{verbatim}") | |
| 355 cat( | 1227 cat( |
| 356 paste( | 1228 paste0( |
| 357 " \\end{tabular}", | 1229 "\n\\begin{", charactersize, "}", tex_caption, |
| 358 "\\end{table}", | 1230 "\\begin{tabwrap}{", tabstops_tex, "}\n" |
| 359 sep = "\n" | |
| 360 ) | |
| 361 ) | 1231 ) |
| 1232 ) | |
| 1233 # emit column names | |
| 1234 cat(column_names) | |
| 1235 # emit hline | |
| 1236 hlinport() | |
| 1237 for (i in seq_len(length(m[, 1]))) { | |
| 1238 my_row <- latex_tabbing_row( | |
| 1239 v = m[i, ], | |
| 1240 underscore_whack = underscore_whack, | |
| 1241 action = paste0 | |
| 1242 ) | |
| 1243 if (FALSE) | |
| 1244 cat(my_row) | |
| 1245 else | |
| 1246 cat(my_row) | |
| 1247 } | |
| 1248 hlinport() | |
| 362 if (omit) | 1249 if (omit) |
| 363 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | 1250 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", |
| 364 n - n0, "rows ]\n") | 1251 n - n0, "rows ]\n") |
| 1252 # End charactersize environment after ending tabbing environment | |
| 1253 cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) | |
| 1254 if (verbatim) cat("\\end{verbatim}\n") | |
| 1255 if (landscape) | |
| 1256 cat("\\end{landscape}\n") | |
| 365 } | 1257 } |
| 366 invisible(x) | 1258 invisible(x) |
| 1259 } | |
| 1260 | |
| 1261 param_df_noexit <- | |
| 1262 function(e = NULL) { | |
| 1263 data_frame_tabbing_latex( | |
| 1264 x = param_df, | |
| 1265 tabstops = c(1.75), | |
| 1266 underscore_whack = TRUE, | |
| 1267 caption = "Input parameters", | |
| 1268 verbatim = FALSE | |
| 1269 ) | |
| 1270 if (!is.null(e)) { | |
| 1271 sink(stderr()) | |
| 1272 cat("Caught fatal error:\n\n") | |
| 1273 str(e) | |
| 1274 sink() | |
| 1275 } | |
| 1276 } | |
| 1277 | |
| 1278 param_df_exit <- | |
| 1279 function(e = NULL) { | |
| 1280 param_df_noexit(e) | |
| 1281 knitr::knit_exit() | |
| 1282 exit(-1) | |
| 1283 } | |
| 1284 | |
| 1285 # exit with exit code (default 0) and optional msg | |
| 1286 exit <- | |
| 1287 function(code = 0, msg = NULL, use_stderr = FALSE) { | |
| 1288 if (!is.null(msg)) { | |
| 1289 if (use_stderr) sink(stderr()) | |
| 1290 cat("\n\n", msg, "\n\n") | |
| 1291 if (use_stderr) sink() | |
| 1292 } | |
| 1293 q(save = "no", status = code) | |
| 1294 } | |
| 1295 | |
| 1296 # make control sequences into printable latex sequences | |
| 1297 latex_printable_control_seqs <- | |
| 1298 function(s) { | |
| 1299 s <- gsub("[\\]", "xyzzy_plugh", s) | |
| 1300 s <- gsub("[$]", "\\\\$", s) | |
| 1301 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
| 1302 return(s) | |
| 1303 } | |
| 1304 nolatex_verbatim <- | |
| 1305 function(expr) eval(expr) | |
| 1306 | |
| 1307 latex_verbatim <- | |
| 1308 function(expr) { | |
| 1309 arg_string <- deparse1(substitute(expr)) | |
| 1310 cat("\n\\begin{verbatim}\n___\n") | |
| 1311 tryCatch( | |
| 1312 expr = expr, | |
| 1313 error = param_df_exit, | |
| 1314 #ACE error = | |
| 1315 #ACE function(e) { | |
| 1316 #ACE cat("Caught error:\n\n") | |
| 1317 #ACE str(e) | |
| 1318 #ACE knitr::knit_exit() | |
| 1319 #ACE stop(e) | |
| 1320 #ACE }, | |
| 1321 finally = cat("...\n\\end{verbatim}\n") | |
| 1322 ) | |
| 1323 } | |
| 1324 | |
| 1325 latex_samepage <- | |
| 1326 function(expr) { | |
| 1327 arg_string <- deparse1(substitute(expr)) | |
| 1328 cat("\n\\begin{samepage}\n") | |
| 1329 tryCatch( | |
| 1330 expr = expr, | |
| 1331 error = param_df_exit, | |
| 1332 #ACE error = | |
| 1333 #ACE function(e) { | |
| 1334 #ACE cat("Caught error:\n\n") | |
| 1335 #ACE str(e) | |
| 1336 #ACE knitr::knit_exit() | |
| 1337 #ACE stop(e) | |
| 1338 #ACE }, | |
| 1339 finally = cat("\n\\end{samepage}\n") | |
| 1340 ) | |
| 1341 } | |
| 1342 | |
| 1343 # return the result of invocation after showing parameters | |
| 1344 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
| 1345 latex_show_invocation <- | |
| 1346 function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { | |
| 1347 function(...) { | |
| 1348 my_env <- (as.list(environment())) | |
| 1349 va <- list(...) | |
| 1350 my_rslt <- new_env() | |
| 1351 my_rslt$rslt <- NULL | |
| 1352 latex_verbatim( | |
| 1353 expr = { | |
| 1354 cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) | |
| 1355 str(va) | |
| 1356 if (!head_patch) { | |
| 1357 # return this result | |
| 1358 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
| 1359 cat(sprintf("\n .. Invoking '%s'\n", f_name)) | |
| 1360 tryCatch( | |
| 1361 { | |
| 1362 cat("\n\\end{verbatim}\n") | |
| 1363 rslt <- do.call(f, va) | |
| 1364 }, | |
| 1365 error = param_df_exit, | |
| 1366 #ACE error = function(e) { | |
| 1367 #ACE cat("\n\\begin{verbatim}\n") | |
| 1368 #ACE str(e) | |
| 1369 #ACE cat("\n\\end{verbatim}\n") | |
| 1370 #ACE knitr::knit_exit() | |
| 1371 #ACE stop(e) | |
| 1372 #ACE }, | |
| 1373 finally = cat("\n\\begin{verbatim}\n") | |
| 1374 ) | |
| 1375 cat(sprintf("\n .. '%s' returned:\n", f_name)) | |
| 1376 str(rslt) | |
| 1377 my_rslt$rslt <- rslt | |
| 1378 } | |
| 1379 } | |
| 1380 ) | |
| 1381 # return the result of invocation with the shown parameters | |
| 1382 # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ | |
| 1383 if (head_patch) my_rslt$rslt <- do.call(f, va) | |
| 1384 (my_rslt$rslt) | |
| 1385 } | |
| 1386 } | |
| 1387 | |
| 1388 latex_collapsed_vector <- function( | |
| 1389 collapse_string, | |
| 1390 v, | |
| 1391 underscore_whack = TRUE, | |
| 1392 action = cat0 | |
| 1393 ) { | |
| 1394 v_sub <- | |
| 1395 if (underscore_whack) whack_underscores(v) else v | |
| 1396 action( | |
| 1397 paste0( | |
| 1398 v_sub, | |
| 1399 collapse = collapse_string | |
| 1400 ) | |
| 1401 ) | |
| 1402 } | |
| 1403 | |
| 1404 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 1405 cat("\\begin{itemize}\n\\item ") | |
| 1406 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 1407 cat("\n\\end{itemize}\n") | |
| 1408 } | |
| 1409 | |
| 1410 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
| 1411 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
| 1412 } | |
| 1413 | |
| 1414 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 1415 cat("\\begin{enumerate}\n\\item ") | |
| 1416 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 1417 cat("\n\\end{enumerate}\n") | |
| 1418 } | |
| 1419 | |
| 1420 latex_enumerated_list <- function(v) { | |
| 1421 latex_enumerated_collapsed("\n\\item ", v) | |
| 1422 } | |
| 1423 | |
| 1424 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
| 1425 latex_collapsed_vector(" & ", v, underscore_whack) | |
| 1426 cat(extra) | |
| 1427 cat(" \\\\\n") | |
| 1428 } | |
| 1429 | |
| 1430 latex_tabbing_row <- function( | |
| 1431 v, | |
| 1432 extra = "", | |
| 1433 underscore_whack = TRUE, | |
| 1434 action = cat0 | |
| 1435 ) { | |
| 1436 # latex_collapsed_vector applies action to result of paste0; | |
| 1437 # by default, action = cat; | |
| 1438 # hence, a scalar string is assigned to v_collapsed | |
| 1439 v_collapsed <- | |
| 1440 latex_collapsed_vector( | |
| 1441 "} \\> \\tabfill{", | |
| 1442 v, | |
| 1443 underscore_whack, | |
| 1444 action = paste0 | |
| 1445 ) | |
| 1446 action( | |
| 1447 "\\tabfill{", | |
| 1448 v_collapsed, | |
| 1449 "}", | |
| 1450 extra, | |
| 1451 " \\\\\n" | |
| 1452 ) | |
| 1453 } | |
| 1454 | |
| 1455 # N.B. use con = "" to emulate regular cat | |
| 1456 fcat0 <- | |
| 1457 function(..., sprtr = " ", cnnctn = file()) { | |
| 1458 cat0(..., sep = sprtr, file = cnnctn) | |
| 1459 invisible(cnnctn) | |
| 367 } | 1460 } |
| 368 | 1461 |
| 369 hypersub <- | 1462 hypersub <- |
| 370 function(s) { | 1463 function(s) { |
| 371 hyper <- tolower(s) | 1464 hyper <- tolower(s) |
| 372 hyper <- gsub("[^a-z0-9]+", "-", hyper) | 1465 hyper <- gsub("[^a-z0-9]+", "-", hyper) |
| 373 hyper <- gsub("[-]+", "-", hyper) | 1466 hyper <- gsub("[-]+", "-", hyper) |
| 1467 hyper <- gsub("[_]+", "-", hyper) | |
| 374 hyper <- sub("^[-]", "", hyper) | 1468 hyper <- sub("^[-]", "", hyper) |
| 375 hyper <- sub("[-]$", "", hyper) | 1469 hyper <- sub("[-]$", "", hyper) |
| 376 return(hyper) | 1470 return(hyper) |
| 377 } | 1471 } |
| 378 | 1472 |
| 379 subsection_header <- | 1473 table_href <- function(s = "offset", caption = "") { |
| 380 function(s) { | 1474 paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") |
| 1475 } | |
| 1476 | |
| 1477 table_offset <- function(i = 0, s = "offset", new = FALSE) { | |
| 1478 paste0( | |
| 1479 if (new) paste0("\\newcounter{", s, "}\n") else "", | |
| 1480 "\\setcounter{", s, "}{\\value{table}}\n", | |
| 1481 paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") | |
| 1482 ) | |
| 1483 } | |
| 1484 | |
| 1485 a_section_header <- | |
| 1486 function(s, prefix = "") { | |
| 381 hyper <- hypersub(s) | 1487 hyper <- hypersub(s) |
| 382 cat( | 1488 my_subsection_header <- sprintf( |
| 383 sprintf( | 1489 "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", |
| 384 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | 1490 hyper, |
| 385 hyper, s, hyper | 1491 prefix, |
| 386 ) | 1492 gsub("_", "\\_", s, fixed = TRUE), |
| 387 ) | 1493 hyper |
| 388 } | 1494 ) |
| 389 | 1495 my_subsection_header |
| 390 subsubsection_header <- | 1496 } |
| 391 function(s) { | 1497 section_header <- function(s) a_section_header(s, "") |
| 392 hyper <- hypersub(s) | 1498 subsection_header <- function(s) a_section_header(s, "sub") |
| 393 cat( | 1499 subsubsection_header <- function(s) a_section_header(s, "subsub") |
| 394 sprintf( | |
| 395 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
| 396 hyper, s, hyper | |
| 397 ) | |
| 398 ) | |
| 399 } | |
| 400 | 1500 |
| 401 ### SQLite functions | 1501 ### SQLite functions |
| 402 | 1502 |
| 403 ddl_exec <- function(db, sql) { | 1503 ddl_exec <- function(db, sql) { |
| 404 discard <- DBI::dbExecute(conn = db, statement = sql) | 1504 discard <- DBI::dbExecute(conn = db, statement = sql) |
| 432 } | 1532 } |
| 433 } | 1533 } |
| 434 | 1534 |
| 435 ### KSEA functions and helpers | 1535 ### KSEA functions and helpers |
| 436 | 1536 |
| 437 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | 1537 #' The KSEA App Analysis (KSEA Kinase Scores Only) |
| 438 # - maximum log2(FC) | 1538 #' |
| 1539 #' Compute KSEA kinase scores and statistics from phoshoproteomics data input | |
| 1540 #' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) | |
| 1541 #' | |
| 1542 #' Result is an R data.frame with column names | |
| 1543 #' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
| 1544 #' "Please refer to the original Casado et al. publication for detailed | |
| 1545 #' description of these columns and what they represent: | |
| 1546 #' | |
| 1547 #' - Kinase.Gene indicates the gene name for each kinase. | |
| 1548 #' - mS represents the mean log2(fold change) of all the | |
| 1549 #' kinase's substrates. | |
| 1550 #' - Enrichment is the background-adjusted value of the kinase's mS. | |
| 1551 #' - m is the total number of detected substrates | |
| 1552 #' from the experimental dataset for each kinase. | |
| 1553 #' - z.score is the normalized score for each kinase, weighted by | |
| 1554 #' the number of identified substrates. | |
| 1555 #' - p.value represents the statistical assessment for the z.score. | |
| 1556 #' - FDR is the p-value adjusted for multiple hypothesis testing | |
| 1557 #' using the Benjamini & Hochberg method." | |
| 1558 #' | |
| 1559 #' @param ksdata the Kinase-Substrate dataset uploaded from the file | |
| 1560 #' prefaced with "PSP&NetworKIN_" | |
| 1561 #' available from github.com/casecpb/KSEA/ | |
| 1562 #' @param px the experimental data file formatted as described in | |
| 1563 #' the KSEA.Complete() documentation | |
| 1564 #' @param networkin a binary input of TRUE or FALSE, indicating whether | |
| 1565 #' or not to include NetworKIN predictions, where | |
| 1566 #' \code{NetworKIN = TRUE} | |
| 1567 #' means include NetworKIN predictions | |
| 1568 #' @param networkin_cutoff a numeric value between 1 and infinity setting | |
| 1569 #' the minimum NetworKIN score | |
| 1570 #' (this can be omitted if NetworKIN = FALSE) | |
| 1571 #' | |
| 1572 #' @return creates a new R data.frame with all the KSEA kinase | |
| 1573 #' scores, along with each one's statistical | |
| 1574 #' assessment, as described herein. | |
| 1575 #' | |
| 1576 #' @references | |
| 1577 #' | |
| 1578 #' Casado et al. (2013) Sci Signal. 6(268):rs6 | |
| 1579 #' | |
| 1580 #' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 | |
| 1581 #' | |
| 1582 #' Horn et al. (2014) Nature Methods 11(6):603-4 | |
| 1583 #' | |
| 439 ksea_scores <- function( | 1584 ksea_scores <- function( |
| 440 | |
| 441 # For human data, typically, ksdata = KSEAapp::ksdata | 1585 # For human data, typically, ksdata = KSEAapp::ksdata |
| 442 ksdata, | 1586 ksdata, |
| 443 | 1587 |
| 444 # Input data file having columns: | 1588 # Input data file having columns: |
| 445 # - Protein : abbreviated protein name | 1589 # - Protein : abbreviated protein name |
| 457 # NetworKIN predictions | 1601 # NetworKIN predictions |
| 458 networkin, | 1602 networkin, |
| 459 | 1603 |
| 460 # A numeric value between 1 and infinity setting the minimum NetworKIN | 1604 # A numeric value between 1 and infinity setting the minimum NetworKIN |
| 461 # score (can be left out if networkin = FALSE) | 1605 # score (can be left out if networkin = FALSE) |
| 462 networkin_cutoff | 1606 networkin_cutoff, |
| 1607 | |
| 1608 # Minimum substrate count, necessary to adjust the p-value appropriately. | |
| 1609 minimum_substrate_count | |
| 463 | 1610 |
| 464 ) { | 1611 ) { |
| 1612 # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. | |
| 465 if (length(grep(";", px$Residue.Both)) == 0) { | 1613 if (length(grep(";", px$Residue.Both)) == 0) { |
| 466 # There are no Residue.Both entries having semicolons, so new is | 1614 # There are no Residue.Both entries having semicolons, so new is |
| 467 # simply px except two columns are renamed and a column is added | 1615 # simply px except two columns are renamed and a column is added |
| 468 # for log2(abs(fold-change)) | 1616 # for log2(abs(fold-change)) |
| 469 new <- px | 1617 new <- px |
| 505 # Convert any illegal values from NaN to NA | 1653 # Convert any illegal values from NaN to NA |
| 506 new[is.nan(new$log2_fc), "log2_fc"] <- NA | 1654 new[is.nan(new$log2_fc), "log2_fc"] <- NA |
| 507 # Eliminate rows having missing values (e.g., non-imputed data) | 1655 # Eliminate rows having missing values (e.g., non-imputed data) |
| 508 new <- new[complete.cases(new$log2_fc), ] | 1656 new <- new[complete.cases(new$log2_fc), ] |
| 509 } | 1657 } |
| 510 if (networkin == TRUE) { | 1658 # At this point, new$log2_fc is signed according to which contrast has |
| 511 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | 1659 # the greater intensity |
| 512 # PhosphoSitePlus data *because its networkin_score is set to Inf* | 1660 # To take the magnitude into account without taking the direction into |
| 513 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | 1661 # account, set params$kseaUseAbsoluteLog2FC to TRUE |
| 514 ksdata_filtered <- ksdata_filtered[ | 1662 # |
| 515 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | 1663 # Should KSEA be performed aggregating signed log2FC or absolute? |
| 516 } else { | 1664 # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores |
| 517 # Otherwise, simply use PhosphSitePlus rows | 1665 if (params$kseaUseAbsoluteLog2FC) { |
| 518 ksdata_filtered <- ksdata[ | 1666 # TRUE use abs(log2FC) for KSEA as Justin requested; this is a |
| 519 grep("PhosphoSitePlus", ksdata$Source), ] | 1667 # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. |
| 520 } | 1668 new$log2_fc <- abs(new$log2_fc) |
| 521 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | 1669 } |
| 1670 | |
| 1671 monitor_filtration_on_stderr <- TRUE | |
| 1672 if (monitor_filtration_on_stderr) { | |
| 1673 # set to TRUE to monitor filtration on stderr | |
| 1674 sink(stderr()) | |
| 1675 cat(see_variable(networkin, "\n")) | |
| 1676 } | |
| 1677 ksdata_filtered <- | |
| 1678 sqldf( | |
| 1679 sprintf("%s %s", | |
| 1680 "select * from ksdata where not Source = 'NetworKIN'", | |
| 1681 if (networkin) | |
| 1682 sprintf("or networkin_score >= %d", networkin_cutoff) | |
| 1683 else | |
| 1684 "" | |
| 1685 ) | |
| 1686 ) | |
| 1687 if (monitor_filtration_on_stderr) { | |
| 1688 cat(see_variable(sqldf( | |
| 1689 "select count(*), Source from ksdata group by Source"), "\n")) | |
| 1690 cat(see_variable(sqldf( | |
| 1691 "select count(*), Source from ksdata_filtered group by Source"), "\n")) | |
| 1692 sink() | |
| 1693 } | |
| 1694 | |
| 1695 ############################################################################ | |
| 1696 # Line numbers below refer to lines of: | |
| 1697 # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R | |
| 1698 # I would put the original line in a comment but then lint would complain... | |
| 1699 # - Indeed, I had to rename all the variables because lint didn't like names | |
| 1700 # containing periods or capital letters. | |
| 1701 # ACE | |
| 1702 ############################################################################ | |
| 1703 # | |
| 1704 # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
| 522 # colnames of ksdata_filtered: | 1705 # colnames of ksdata_filtered: |
| 523 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | 1706 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" |
| 524 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | 1707 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" |
| 525 # "SITE_...7_AA" "networkin_score" "Source" | 1708 # "SITE_...7_AA" "networkin_score" "Source" |
| 526 # colnames of new: | 1709 # colnames of new: |
| 529 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | 1712 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc |
| 530 # FROM ksdata_filtered a | 1713 # FROM ksdata_filtered a |
| 531 # INNER JOIN new b | 1714 # INNER JOIN new b |
| 532 # ON a.SUB_GENE = b.SUB_GENE | 1715 # ON a.SUB_GENE = b.SUB_GENE |
| 533 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | 1716 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD |
| 1717 # (KSEA.Scores.R line # 105) | |
| 1718 # "Extract KSData.filtered annotations that are only found in new" | |
| 534 ksdata_dataset <- base::merge(ksdata_filtered, new) | 1719 ksdata_dataset <- base::merge(ksdata_filtered, new) |
| 535 # colnames of ksdata_dataset: | 1720 # colnames of ksdata_dataset: |
| 536 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | 1721 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" |
| 537 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | 1722 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" |
| 538 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | 1723 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" |
| 539 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | 1724 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) |
| 540 # Re-order dataset; prior to accounting for isoforms | 1725 # Re-order dataset; prior to accounting for isoforms |
| 1726 # (KSEA.Scores.R line # 106) | |
| 541 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | 1727 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] |
| 542 # Extract non-isoform accession in UniProtKB | 1728 # Extract non-isoform accession in UniProtKB |
| 1729 # (KSEA.Scores.R line # 107) | |
| 543 ksdata_dataset$uniprot_no_isoform <- sapply( | 1730 ksdata_dataset$uniprot_no_isoform <- sapply( |
| 544 ksdata_dataset$KIN_ACC_ID, | 1731 ksdata_dataset$KIN_ACC_ID, |
| 545 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | 1732 function(x) unlist(strsplit(as.character(x), split = "-"))[1] |
| 546 ) | 1733 ) |
| 1734 # "last expression collapses isoforms ... for easy processing" | |
| 547 # Discard previous results while selecting interesting columns ... | 1735 # Discard previous results while selecting interesting columns ... |
| 1736 # (KSEA.Scores.R line # 110) | |
| 548 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | 1737 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] |
| 549 # Column names are now: | 1738 # Column names are now: |
| 550 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | 1739 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" |
| 551 # "FC" "log2_fc" "Source" | 1740 # "FC" "log2_fc" "Source" |
| 552 # Make column names human-readable | 1741 # Make column names human-readable |
| 1742 # (KSEA.Scores.R line # 111) | |
| 553 colnames(ksdata_dataset_abbrev) <- c( | 1743 colnames(ksdata_dataset_abbrev) <- c( |
| 554 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | 1744 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", |
| 555 "FC", "log2FC", "Source" | 1745 "FC", "log2FC", "Source" |
| 556 ) | 1746 ) |
| 557 # SELECT * FROM ksdata_dataset_abbrev | 1747 # SELECT * FROM ksdata_dataset_abbrev |
| 558 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | 1748 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p |
| 1749 # (KSEA.Scores.R line # 112) | |
| 1750 # "Extract KSData.filtered annotations that are only found in new" | |
| 559 ksdata_dataset_abbrev <- | 1751 ksdata_dataset_abbrev <- |
| 560 ksdata_dataset_abbrev[ | 1752 ksdata_dataset_abbrev[ |
| 561 order( | 1753 order( |
| 562 ksdata_dataset_abbrev$Kinase.Gene, | 1754 ksdata_dataset_abbrev$Kinase.Gene, |
| 563 ksdata_dataset_abbrev$Substrate.Gene, | 1755 ksdata_dataset_abbrev$Substrate.Gene, |
| 564 ksdata_dataset_abbrev$Substrate.Mod, | 1756 ksdata_dataset_abbrev$Substrate.Mod, |
| 565 ksdata_dataset_abbrev$p), | 1757 ksdata_dataset_abbrev$p), |
| 566 ] | 1758 ] |
| 1759 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
| 567 # First aggregation step to account for multiply phosphorylated peptides | 1760 # First aggregation step to account for multiply phosphorylated peptides |
| 568 # and differing peptide sequences; the goal here is to combine results | 1761 # and differing peptide sequences; the goal here is to combine results |
| 569 # for all measurements of the same substrate. | 1762 # for all measurements of the same substrate. |
| 570 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1763 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
| 571 # `Source`, avg(log2FC) AS log2FC | 1764 # `Source`, avg(log2FC) AS log2FC |
| 573 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | 1766 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, |
| 574 # `Source` | 1767 # `Source` |
| 575 # ORDER BY `Kinase.Gene`; | 1768 # ORDER BY `Kinase.Gene`; |
| 576 # in two steps: | 1769 # in two steps: |
| 577 # (1) compute average log_2(fold-change) | 1770 # (1) compute average log_2(fold-change) |
| 1771 # "take the mean of the log2FC amongst phosphosite duplicates" | |
| 1772 # (KSEA.Scores.R line # 115) | |
| 578 ksdata_dataset_abbrev <- aggregate( | 1773 ksdata_dataset_abbrev <- aggregate( |
| 579 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | 1774 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, |
| 580 data = ksdata_dataset_abbrev, | 1775 data = ksdata_dataset_abbrev, |
| 581 FUN = mean | 1776 FUN = mean |
| 582 ) | 1777 ) |
| 1778 if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) | |
| 583 # (2) order by Kinase.Gene | 1779 # (2) order by Kinase.Gene |
| 1780 # (KSEA.Scores.R line # 117) | |
| 584 ksdata_dataset_abbrev <- | 1781 ksdata_dataset_abbrev <- |
| 585 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | 1782 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] |
| 586 # SELECT `Kinase.Gene`, count(*) | 1783 # SELECT `Kinase.Gene`, count(*) |
| 587 # FROM ksdata_dataset_abbrev | 1784 # FROM ksdata_dataset_abbrev |
| 588 # GROUP BY `Kinase.Gene`; | 1785 # GROUP BY `Kinase.Gene`; |
| 589 # in two steps: | 1786 # in two steps: |
| 590 # (1) Extract the list of Kinase.Gene names | 1787 # (1) Extract the list of Kinase.Gene names |
| 1788 # "@@@@@@@@@@@@@@@@@@@@" | |
| 1789 # "Do analysis for KSEA" | |
| 1790 # "@@@@@@@@@@@@@@@@@@@@" | |
| 1791 # (KSEA.Scores.R line # 124) | |
| 591 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | 1792 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) |
| 592 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | 1793 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, |
| 593 # named by Kinase.Gene | 1794 # named by Kinase.Gene |
| 1795 # (KSEA.Scores.R line # 125) | |
| 594 kinase_list <- as.matrix(table(kinase_list)) | 1796 kinase_list <- as.matrix(table(kinase_list)) |
| 595 # Second aggregation step to account for all substrates per kinase | 1797 # Second aggregation step to account for all substrates per kinase |
| 596 # CREATE TABLE mean_fc | 1798 # CREATE TABLE mean_fc |
| 597 # AS | 1799 # AS |
| 598 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | 1800 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC |
| 599 # FROM ksdata_dataset_abbrev | 1801 # FROM ksdata_dataset_abbrev |
| 600 # GROUP BY `Kinase.Gene` | 1802 # GROUP BY `Kinase.Gene` |
| 601 mean_fc <- aggregate( | 1803 # (KSEA.Scores.R line # 127) |
| 602 log2FC ~ Kinase.Gene, | 1804 if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") |
| 603 data = ksdata_dataset_abbrev, | 1805 mean_fc <- |
| 604 FUN = mean | 1806 aggregate( |
| 605 ) | |
| 606 # mean_fc columns: "Kinase.Gene", "log2FC" | |
| 607 if (FALSE) { | |
| 608 # I need to re-think this; I was trying to find the most-represented | |
| 609 # peptide, but that horse has already left the barn | |
| 610 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
| 611 # FROM ksdata_dataset_abbrev | |
| 612 # GROUP BY `Kinase.Gene` | |
| 613 max_fc <- aggregate( | |
| 614 log2FC ~ Kinase.Gene, | 1807 log2FC ~ Kinase.Gene, |
| 615 data = ksdata_dataset_abbrev, | 1808 data = ksdata_dataset_abbrev, |
| 616 FUN = function(r) max(abs(r)) | 1809 FUN = mean |
| 617 ) | 1810 ) |
| 618 } | 1811 if (print_nb_messages) nbe(see_variable(mean_fc), "\n") |
| 1812 | |
| 1813 # for contrast j | |
| 1814 # for each kinase i | |
| 1815 # extract log2 of fold-change (from `new` above) | |
| 1816 # (used in KSEA.Scores.R lines # 130 & 132) | |
| 1817 log2_fc_j_each_i <- | |
| 1818 new$log2_fc | |
| 1819 | |
| 1820 # for contrast j | |
| 1821 # for all kinases i | |
| 1822 # compute mean of abs(log2 of fold-change) | |
| 1823 # (used in KSEA.Scores.R lines # 130) | |
| 1824 mean_abs_log2_fc_j_all_i <- | |
| 1825 mean(abs(log2_fc_j_each_i), na.rm = TRUE) | |
| 1826 | |
| 1827 # for contrast j | |
| 1828 # for all kinases i | |
| 1829 # compute mean of log2 of fold-change | |
| 1830 # (used in KSEA.Scores.R lines # 132) | |
| 1831 mean_log2_fc_j_all_i <- | |
| 1832 mean(log2_fc_j_each_i, na.rm = TRUE) | |
| 1833 | |
| 1834 # Reorder mean_fc, although I don't see why | |
| 1835 # (KSEA.Scores.R line 128 | |
| 1836 mean_fc <- mean_fc[order(mean_fc[, 1]), ] | |
| 1837 # mean_fc columns so far: "Kinase.Gene", "log2FC" | |
| 1838 # - Kinase.Gene | |
| 1839 # indicates the gene name for each kinase. | |
| 619 | 1840 |
| 620 # Create column 3: mS | 1841 # Create column 3: mS |
| 621 mean_fc$m_s <- mean_fc[, 2] | 1842 # - mS |
| 1843 # represents the mean log2(fold change) of all the | |
| 1844 # kinase's substrates. | |
| 1845 # (KSEA.Scores.R line # 129) | |
| 1846 mean_fc$m_s <- | |
| 1847 mean_fc_m_s <- mean_fc[, 2] | |
| 1848 | |
| 622 # Create column 4: Enrichment | 1849 # Create column 4: Enrichment |
| 623 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | 1850 # - Enrichment |
| 624 # Create column 5: m, count of substrates | 1851 # is the background-adjusted value of the kinase's mS. |
| 625 mean_fc$m <- kinase_list | 1852 # (KSEA.Scores.R line # 130) |
| 1853 mean_fc$enrichment <- | |
| 1854 mean_fc_m_s / mean_abs_log2_fc_j_all_i | |
| 1855 | |
| 1856 # Create column 5: m, count of substrates of kinase (count of j for i) | |
| 1857 # - m | |
| 1858 # is the total number of detected substrates | |
| 1859 # from the experimental dataset for each kinase. | |
| 1860 # (KSEA.Scores.R line # 131) | |
| 1861 mean_fc$m <- | |
| 1862 mean_fc_m <- kinase_list | |
| 1863 | |
| 1864 | |
| 626 # Create column 6: z-score | 1865 # Create column 6: z-score |
| 627 mean_fc$z_score <- ( | 1866 # - z.score |
| 628 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | 1867 # is the normalized score for each kinase, weighted by |
| 629 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | 1868 # the number of identified substrates. |
| 1869 # (KSEA.Scores.R line # 132) | |
| 1870 mean_fc$z_score <- | |
| 1871 (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / | |
| 1872 sd(log2_fc_j_each_i, na.rm = TRUE) | |
| 1873 | |
| 630 # Create column 7: p-value, deduced from z-score | 1874 # Create column 7: p-value, deduced from z-score |
| 631 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | 1875 # - p.value |
| 1876 # represents the statistical assessment for the z.score. | |
| 1877 # (KSEA.Scores.R line # 133) | |
| 1878 # "one-tailed p-value" | |
| 1879 mean_fc$p_value <- | |
| 1880 pnorm(-abs(mean_fc$z_score)) | |
| 1881 | |
| 1882 # zap excluded kinases; this must be done before adjusting p-value | |
| 1883 if (TRUE) { | |
| 1884 mean_fc <- | |
| 1885 mean_fc[ | |
| 1886 mean_fc$m >= minimum_substrate_count, | |
| 1887 , | |
| 1888 drop = FALSE | |
| 1889 ] | |
| 1890 } | |
| 1891 | |
| 1892 #ACE nb(see_variable(nrow(mean_fc)), "\n") | |
| 632 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | 1893 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value |
| 633 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | 1894 # - FDR |
| 634 | 1895 # is the p-value adjusted for multiple hypothesis testing |
| 635 # Remove log2FC column, which is duplicated as mS | 1896 # using the Benjamini & Hochberg method." |
| 636 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | 1897 # (KSEA.Scores.R line # 134) |
| 1898 mean_fc$fdr <- | |
| 1899 p.adjust(mean_fc$p_value, method = "fdr") | |
| 1900 | |
| 1901 # It makes no sense to leave Z-scores negative when using | |
| 1902 # absolute value of fold-change | |
| 1903 if (params$kseaUseAbsoluteLog2FC) { | |
| 1904 mean_fc$z_score <- abs(mean_fc$z_score) | |
| 1905 } | |
| 1906 | |
| 1907 # Remove second column (log2FC), which is duplicated as mS | |
| 1908 # (KSEA.Scores.R line # 136) | |
| 1909 mean_fc <- | |
| 1910 mean_fc[order(mean_fc$Kinase.Gene), -2] | |
| 637 # Correct the column names which we had to hack because of the linter... | 1911 # Correct the column names which we had to hack because of the linter... |
| 638 colnames(mean_fc) <- c( | 1912 colnames(mean_fc) <- c( |
| 639 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | 1913 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" |
| 640 ) | 1914 ) |
| 1915 # (KSEA.Scores.R line # 138) | |
| 641 return(mean_fc) | 1916 return(mean_fc) |
| 642 } | 1917 } |
| 643 | 1918 |
| 644 low_fdr_barplot <- function( | 1919 ksea_low_fdr_barplot_factory <- function( |
| 645 rslt, | 1920 rslt, |
| 646 i_cntrst, | 1921 i_cntrst, |
| 647 i, | 1922 i, |
| 648 a_level, | 1923 a_level, |
| 649 b_level, | 1924 b_level, |
| 671 k$fdr | 1946 k$fdr |
| 672 }, | 1947 }, |
| 673 "p.value" = { | 1948 "p.value" = { |
| 674 k$p_value | 1949 k$p_value |
| 675 }, | 1950 }, |
| 676 stop( | 1951 { |
| 677 sprintf( | 1952 cat( |
| 678 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 1953 sprintf( |
| 679 ksea_cutoff_statistic | 1954 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
| 1955 ksea_cutoff_statistic | |
| 1956 ) | |
| 680 ) | 1957 ) |
| 681 ) | 1958 param_df_exit() |
| 682 ) | 1959 knitr::knit_exit() |
| 1960 } | |
| 1961 ) | |
| 683 | 1962 |
| 684 k <- k[selector < ksea_cutoff_threshold, ] | 1963 k <- k[selector < ksea_cutoff_threshold, ] |
| 685 | 1964 nrow_k <- nrow(k) |
| 686 if (nrow(k) > 0) { | 1965 |
| 687 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | 1966 #ACE nbe(see_variable(fdr_barplot_dataframe <- k)) |
| 1967 | |
| 1968 if (nrow_k > 0) { | |
| 1969 max_nchar_rowname <- max(nchar(rownames(k))) | |
| 1970 my_cex_names <- 1.0 / (1 + nrow_k / 50) | |
| 1971 | |
| 1972 if (print_trace_messages) cat_margins("Initially") | |
| 1973 if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) | |
| 1974 if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) | |
| 1975 if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) | |
| 1976 | |
| 1977 # fin: The figure region dimensions, (width, height), in inches. | |
| 1978 # mar: A numerical vector of the form c(bottom, left, top, right) | |
| 1979 # that gives the number of lines of margin to be specified | |
| 1980 # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 | |
| 1981 | |
| 1982 # mar: The figure region dimensions, (width, height), in inches. | |
| 688 numeric_z_score <- as.numeric(k$z_score) | 1983 numeric_z_score <- as.numeric(k$z_score) |
| 689 z_score_order <- order(numeric_z_score) | 1984 bar_order <- order(-as.numeric(k$p_value)) |
| 690 kinase_name <- k$kinase_gene | 1985 kinase_name <- k$kinase_gene |
| 691 long_caption <- | 1986 long_caption <- |
| 692 sprintf( | 1987 sprintf( |
| 693 "Kinase z-score, %s < %s, %s", | 1988 "Kinase z-score, %s, KSEA %s < %s", |
| 1989 caption, | |
| 694 ksea_cutoff_statistic, | 1990 ksea_cutoff_statistic, |
| 695 ksea_cutoff_threshold, | 1991 ksea_cutoff_threshold |
| 696 caption | |
| 697 ) | 1992 ) |
| 698 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | 1993 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) |
| 699 cat("\n\\clearpage\n") | 1994 # return a function that draws the plot |
| 700 barplot( | 1995 function() { |
| 701 height = numeric_z_score[z_score_order], | 1996 par_fin <- par("fin") # vector of width_in_inches and height_in_inches) |
| 702 border = NA, | 1997 op <- par( |
| 703 xpd = FALSE, | 1998 bg = if (print_trace_messages) "yellow" else "white", |
| 704 cex.names = 1.0, | 1999 fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), |
| 705 main = long_caption, | 2000 mar = par("mar") + |
| 706 cex.main = my_cex_caption, | 2001 c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) |
| 707 names.arg = kinase_name[z_score_order], | 2002 # bottom, left, top, right |
| 708 horiz = TRUE, | |
| 709 srt = 45, | |
| 710 las = 1, | |
| 711 cex.axis = 0.9 | |
| 712 ) | 2003 ) |
| 713 par(op) | 2004 on.exit(par(op)) |
| 2005 if (print_trace_messages) cat_margins("Eventually") | |
| 2006 | |
| 2007 barplot( | |
| 2008 height = numeric_z_score[bar_order], | |
| 2009 border = NA, | |
| 2010 xpd = FALSE, | |
| 2011 cex.names = my_cex_names, | |
| 2012 main = long_caption, | |
| 2013 cex.main = my_cex_caption, | |
| 2014 names.arg = kinase_name[bar_order], | |
| 2015 horiz = TRUE, | |
| 2016 srt = 45, | |
| 2017 las = 1, | |
| 2018 cex.axis = 0.9 | |
| 2019 ) | |
| 2020 } | |
| 714 } | 2021 } |
| 2022 } else { | |
| 2023 no_op | |
| 715 } | 2024 } |
| 716 } | 2025 } |
| 717 | 2026 |
| 718 # note that this adds elements to the global variable `ksea_asterisk_hash` | 2027 # note that this adds elements to the global variable `ksea_asterisk_hash` |
| 719 | 2028 |
| 720 low_fdr_print <- function( | 2029 ksea_low_fdr_print <- function( |
| 721 rslt, | 2030 rslt, |
| 722 i_cntrst, | 2031 i_cntrst, |
| 723 i, | 2032 i, |
| 724 a_level, | 2033 a_level, |
| 725 b_level, | 2034 b_level, |
| 726 fold_change, | 2035 fold_change, |
| 727 caption | 2036 caption, |
| 2037 write_db = TRUE, # if TRUE, write to DB, else print table | |
| 2038 anchor = c(const_table_anchor_p, const_table_anchor_t) | |
| 728 ) { | 2039 ) { |
| 729 rslt_score_list_i <- rslt$score_list[[i]] | 2040 rslt_score_list_i <- rslt$score_list[[i]] |
| 730 if (!is.null(rslt_score_list_i)) { | 2041 if (!is.null(rslt_score_list_i)) { |
| 731 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | 2042 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) |
| 732 k <- contrast_ksea_scores <- data.frame( | 2043 k <- contrast_ksea_scores <- data.frame( |
| 748 k$fdr | 2059 k$fdr |
| 749 }, | 2060 }, |
| 750 "p.value" = { | 2061 "p.value" = { |
| 751 k$p_value | 2062 k$p_value |
| 752 }, | 2063 }, |
| 753 stop( | 2064 { |
| 754 sprintf( | 2065 cat( |
| 755 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 2066 sprintf( |
| 756 ksea_cutoff_statistic | 2067 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", |
| 2068 ksea_cutoff_statistic | |
| 2069 ) | |
| 757 ) | 2070 ) |
| 758 ) | 2071 param_df_exit() |
| 759 ) | 2072 knitr::knit_exit() |
| 2073 } | |
| 2074 ) | |
| 760 | 2075 |
| 761 k <- k[selector < ksea_cutoff_threshold, ] | 2076 k <- k[selector < ksea_cutoff_threshold, ] |
| 762 # save kinase names to ksea_asterisk_hash | 2077 # save kinase names to ksea_asterisk_hash |
| 763 for (kinase_name in k$kinase_gene) { | 2078 for (kinase_name in k$kinase_gene) { |
| 764 ksea_asterisk_hash[[kinase_name]] <- 1 | 2079 ksea_asterisk_hash[[kinase_name]] <- 1 |
| 765 } | 2080 } |
| 766 | 2081 |
| 767 db_write_table_overwrite <- (i_cntrst < 2) | 2082 if (write_db) { |
| 768 db_write_table_append <- !db_write_table_overwrite | 2083 db_write_table_overwrite <- (i_cntrst < 2) |
| 769 RSQLite::dbWriteTable( | 2084 db_write_table_append <- !db_write_table_overwrite |
| 770 conn = db, | 2085 RSQLite::dbWriteTable( |
| 771 name = "contrast_ksea_scores", | 2086 conn = db, |
| 772 value = contrast_ksea_scores, | 2087 name = "contrast_ksea_scores", |
| 773 append = db_write_table_append | 2088 value = contrast_ksea_scores, |
| 774 ) | 2089 append = db_write_table_append |
| 775 selector <- switch( | 2090 ) |
| 776 ksea_cutoff_statistic, | 2091 "" |
| 777 "FDR" = { | 2092 } else { |
| 778 contrast_ksea_scores$fdr | 2093 selector <- switch( |
| 779 }, | 2094 ksea_cutoff_statistic, |
| 780 "p.value" = { | 2095 "FDR" = { |
| 781 contrast_ksea_scores$p_value | 2096 contrast_ksea_scores$fdr |
| 782 }, | 2097 }, |
| 783 stop( | 2098 "p.value" = { |
| 784 sprintf( | 2099 contrast_ksea_scores$p_value |
| 785 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | 2100 }, |
| 786 ksea_cutoff_statistic | 2101 { |
| 2102 cat( | |
| 2103 sprintf( | |
| 2104 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 2105 ksea_cutoff_statistic | |
| 2106 ) | |
| 2107 ) | |
| 2108 param_df_exit() | |
| 2109 knitr::knit_exit() | |
| 2110 } | |
| 2111 ) | |
| 2112 if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) | |
| 2113 output_df <- contrast_ksea_scores[ | |
| 2114 selector < ksea_cutoff_threshold, | |
| 2115 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
| 2116 "z_score", "p_value", "fdr") | |
| 2117 ] | |
| 2118 output_df$kinase_gene <- | |
| 2119 gsub( | |
| 2120 "_", | |
| 2121 "\\\\_", | |
| 2122 output_df$kinase_gene | |
| 2123 ) | |
| 2124 colnames(output_df) <- | |
| 2125 c( | |
| 2126 colnames(output_df)[1], | |
| 2127 colnames(output_df)[2], | |
| 2128 "enrichment", | |
| 2129 "m_s", | |
| 2130 "z_score", | |
| 2131 "p_value", | |
| 2132 "fdr" | |
| 2133 ) | |
| 2134 #ACE output_order <- with(output_df, order(fdr)) | |
| 2135 output_order <- with(output_df, order(p_value)) | |
| 2136 output_df <- output_df[output_order, ] | |
| 2137 | |
| 2138 output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) | |
| 2139 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
| 2140 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
| 2141 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
| 2142 output_df$m_s <- sprintf("%d", output_df$m_s) | |
| 2143 output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) | |
| 2144 output_ncol <- ncol(output_df) | |
| 2145 colnames(output_df) <- | |
| 2146 c( | |
| 2147 "Kinase", | |
| 2148 "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", | |
| 2149 "Enrichment", | |
| 2150 "Substrates", | |
| 2151 "z-score", | |
| 2152 "p-value", | |
| 2153 "FDR" | |
| 2154 ) | |
| 2155 selector <- switch( | |
| 2156 ksea_cutoff_statistic, | |
| 2157 "FDR" = { | |
| 2158 rslt$score_list[[i]]$FDR | |
| 2159 }, | |
| 2160 "p.value" = { | |
| 2161 rslt$score_list[[i]]$p.value | |
| 2162 }, | |
| 2163 { | |
| 2164 cat( | |
| 2165 sprintf( | |
| 2166 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 2167 ksea_cutoff_statistic | |
| 2168 ) | |
| 2169 ) | |
| 2170 param_df_exit() | |
| 2171 knitr::knit_exit() | |
| 2172 } | |
| 2173 ) | |
| 2174 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
| 2175 if (print_nb_messages) nbe(see_variable(output_df)) | |
| 2176 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
| 2177 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
| 2178 # with ( | |
| 2179 # output_df, | |
| 2180 # ) | |
| 2181 if (TRUE) { | |
| 2182 output_df$Kinase <- whack_underscores(output_df$Kinase) | |
| 2183 data_frame_tabbing_latex( | |
| 2184 x = output_df, | |
| 2185 # vector of tab stops, in inches | |
| 2186 tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0), | |
| 2187 # vector of headings, registered with tab-stops | |
| 2188 headings = colnames(output_df), | |
| 2189 # digits to pass to format.data.frame | |
| 2190 digits = NULL, | |
| 2191 # maximumn number of rows to print | |
| 2192 max = NULL, | |
| 2193 # optional caption | |
| 2194 caption = sprintf( | |
| 2195 "\\text{%s}, KSEA %s < %s", | |
| 2196 math_caption, | |
| 2197 ksea_cutoff_statistic, | |
| 2198 ksea_cutoff_threshold | |
| 2199 ), | |
| 2200 # set underscore_whack to TRUE to escape underscores | |
| 2201 underscore_whack = FALSE, | |
| 2202 # flag for landscape mode | |
| 2203 landscape = FALSE, | |
| 2204 # flag indicating that subsubsection should be used for caption | |
| 2205 # rather than subsection | |
| 2206 use_subsubsection_header = TRUE, | |
| 2207 # character-size indicator; for possible values, see: | |
| 2208 # https://tug.org/texinfohtml/latex2e.html#Font-sizes | |
| 2209 charactersize = "small", | |
| 2210 # set verbatim to TRUE to debug output | |
| 2211 verbatim = FALSE | |
| 2212 ) | |
| 2213 } else { | |
| 2214 data_frame_table_latex( | |
| 2215 x = output_df, | |
| 2216 justification = "l c c c c c c", | |
| 2217 centered = TRUE, | |
| 2218 caption = sprintf( | |
| 2219 "\\text{%s}, KSEA %s < %s", | |
| 2220 math_caption, | |
| 2221 ksea_cutoff_statistic, | |
| 2222 ksea_cutoff_threshold | |
| 2223 ), | |
| 2224 anchor = anchor, | |
| 2225 underscore_whack = FALSE | |
| 2226 ) | |
| 2227 } | |
| 2228 } else { | |
| 2229 cat( | |
| 2230 sprintf( | |
| 2231 "\\break | |
| 2232 No kinases had | |
| 2233 \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) | |
| 2234 for contrast %s\\hfill\\break\n", | |
| 2235 ksea_cutoff_statistic, | |
| 2236 ksea_cutoff_threshold, | |
| 2237 caption | |
| 787 ) | 2238 ) |
| 788 ) | 2239 ) |
| 789 ) | 2240 } |
| 790 output_df <- contrast_ksea_scores[ | |
| 791 selector < ksea_cutoff_threshold, | |
| 792 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
| 793 "z_score", "p_value", "fdr") | |
| 794 ] | |
| 795 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
| 796 output_df <- output_df[output_order, ] | |
| 797 colnames(output_df) <- | |
| 798 c( | |
| 799 colnames(output_df)[1], | |
| 800 colnames(output_df)[2], | |
| 801 "enrichment", | |
| 802 "m_s", | |
| 803 "z_score", | |
| 804 "p_value", | |
| 805 "fdr" | |
| 806 ) | |
| 807 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
| 808 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
| 809 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
| 810 output_df$m_s <- sprintf("%d", output_df$m_s) | |
| 811 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
| 812 output_ncol <- ncol(output_df) | |
| 813 colnames(output_df) <- | |
| 814 c( | |
| 815 "Kinase", | |
| 816 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
| 817 "Enrichment", | |
| 818 "Substrates", | |
| 819 "z-score", | |
| 820 "p-value", | |
| 821 "FDR" | |
| 822 ) | |
| 823 selector <- switch( | |
| 824 ksea_cutoff_statistic, | |
| 825 "FDR" = { | |
| 826 rslt$score_list[[i]]$FDR | |
| 827 }, | |
| 828 "p.value" = { | |
| 829 rslt$score_list[[i]]$p.value | |
| 830 }, | |
| 831 stop( | |
| 832 sprintf( | |
| 833 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 834 ksea_cutoff_statistic | |
| 835 ) | |
| 836 ) | |
| 837 ) | |
| 838 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
| 839 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
| 840 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
| 841 data_frame_latex( | |
| 842 x = output_df, | |
| 843 justification = "l c c c c c c", | |
| 844 centered = TRUE, | |
| 845 caption = sprintf( | |
| 846 "\\text{%s}, %s < %s", | |
| 847 math_caption, | |
| 848 ksea_cutoff_statistic, | |
| 849 ksea_cutoff_threshold | |
| 850 ), | |
| 851 anchor = const_table_anchor_p | |
| 852 ) | |
| 853 } else { | |
| 854 cat( | |
| 855 sprintf( | |
| 856 "\\break | |
| 857 No kinases had | |
| 858 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
| 859 for contrast %s\\hfill\\break\n", | |
| 860 ksea_cutoff_statistic, | |
| 861 ksea_cutoff_threshold, | |
| 862 caption | |
| 863 ) | |
| 864 ) | |
| 865 } | 2241 } |
| 2242 } else { | |
| 2243 "" | |
| 866 } | 2244 } |
| 867 } | 2245 } |
| 868 | 2246 |
| 869 # create_breaks is a helper for ksea_heatmap | 2247 # create_breaks is a helper for ksea_heatmap |
| 870 create_breaks <- function(merged_scores) { | 2248 create_breaks <- function(merged_scores) { |
| 905 mycol <- unique(append(mycol_neg, mycol_pos)) | 2283 mycol <- unique(append(mycol_neg, mycol_pos)) |
| 906 color_breaks <- list(breaks_all, mycol) | 2284 color_breaks <- list(breaks_all, mycol) |
| 907 return(color_breaks) | 2285 return(color_breaks) |
| 908 } | 2286 } |
| 909 | 2287 |
| 2288 hm2plus <- function( | |
| 2289 x, | |
| 2290 mat = matrix( | |
| 2291 c( | |
| 2292 c(0, 4, 0), | |
| 2293 c(0, 3, 3), | |
| 2294 c(2, 1, 1) | |
| 2295 ), | |
| 2296 nrow = 3, | |
| 2297 ncol = 3, | |
| 2298 byrow = TRUE | |
| 2299 ), | |
| 2300 denwid = 0.5, | |
| 2301 denhgt = 0.15, | |
| 2302 widths = c(0.5, 2.5, 1.5), | |
| 2303 heights = c(0.4, 0.15, 3.95), | |
| 2304 divergent = FALSE, | |
| 2305 notecol = "grey50", | |
| 2306 trace = "none", | |
| 2307 margins = c(6, 20), | |
| 2308 srtcol = 90, | |
| 2309 srtrow = 0, | |
| 2310 density_info = "none", | |
| 2311 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
| 2312 key_par = list(), | |
| 2313 hclustfun = hclust, | |
| 2314 ... | |
| 2315 ) { | |
| 2316 | |
| 2317 varargs <- list(...) | |
| 2318 if (FALSE) # this is to avoid commenting out code to pass linting... | |
| 2319 my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE) | |
| 2320 else | |
| 2321 my_hm2 <- heatmap.2 | |
| 2322 | |
| 2323 x <- as.matrix(x) | |
| 2324 if (sum(!is.na(x)) < 1) | |
| 2325 return(NULL) | |
| 2326 color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough | |
| 2327 break_count <- 1 + color_count | |
| 2328 min_nonax <- min(x, na.rm = TRUE) | |
| 2329 max_nonax <- max(x, na.rm = TRUE) | |
| 2330 if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") | |
| 2331 if (divergent) { | |
| 2332 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
| 2333 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
| 2334 breaks <- (zlim) / (break_count:1) | |
| 2335 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
| 2336 breaks <- breaks - median(breaks) | |
| 2337 zlim <- c(-zlim, zlim) | |
| 2338 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
| 2339 } else { | |
| 2340 zlim <- max(abs(min_nonax), abs(max_nonax)) | |
| 2341 if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) | |
| 2342 breaks <- zlim / (break_count:1) | |
| 2343 if (print_nb_messages) nb(see_variable(breaks, "\n")) | |
| 2344 if (max_nonax < 0) { | |
| 2345 breaks <- breaks - zlim | |
| 2346 zlim <- c(-zlim, 0) | |
| 2347 } else { | |
| 2348 zlim <- c(0, zlim) | |
| 2349 } | |
| 2350 if (print_nb_messages) nb(see_variable(zlim, "\n")) | |
| 2351 } | |
| 2352 nonax <- x | |
| 2353 nonax[is.na(x)] <- min_nonax | |
| 2354 if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) | |
| 2355 if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) | |
| 2356 colors <- | |
| 2357 if (divergent && min_nonax < 0) { | |
| 2358 # divergent colors on both sides of zero | |
| 2359 colorRampPalette(c("red", "white", "blue"))(color_count) | |
| 2360 } else if (divergent && min_nonax > 0) { | |
| 2361 # "divergent" colors > zero | |
| 2362 colorRampPalette(c("white", "blue"))(color_count) | |
| 2363 } else if (divergent && max_nonax < 0) { | |
| 2364 # "divergent" colors < zero | |
| 2365 colorRampPalette(c("red", "white"))(color_count) | |
| 2366 } else { | |
| 2367 # "non-divergent" colors including zero | |
| 2368 hcl.colors(color_count, "YlOrRd", rev = TRUE) | |
| 2369 } | |
| 2370 | |
| 2371 #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n") | |
| 2372 #ACE if (print_nb_messages) nb(see_variable(colors, "\n")) | |
| 2373 #ACE key_par$col = colors | |
| 2374 #ACE key_par$breaks = breaks | |
| 2375 | |
| 2376 if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me | |
| 2377 if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me | |
| 2378 suppressWarnings( | |
| 2379 my_hm2( | |
| 2380 x = x, | |
| 2381 col = colors, | |
| 2382 #ACE symkey = FALSE, | |
| 2383 density.info = density_info, | |
| 2384 srtCol = srtcol, | |
| 2385 srtRow = srtrow, | |
| 2386 margins = margins, | |
| 2387 lwid = widths, | |
| 2388 lhei = heights, | |
| 2389 key.title = NA, | |
| 2390 key.xlab = key_xlab, | |
| 2391 key.par = key_par, | |
| 2392 lmat = mat, | |
| 2393 notecol = notecol, | |
| 2394 trace = trace, | |
| 2395 bg = "yellow", | |
| 2396 hclustfun = hclustfun, | |
| 2397 #ACE breaks = breaks, | |
| 2398 oldstyle = FALSE, | |
| 2399 ... # varargs | |
| 2400 ) | |
| 2401 ) | |
| 2402 # implicitly returning value returned by heatmap.2 | |
| 2403 } | |
| 2404 | |
| 910 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | 2405 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap |
| 911 draw_kseaapp_summary_heatmap <- function( | 2406 draw_kseaapp_summary_heatmap <- function( |
| 912 x, | 2407 x, # matrix with row/col names already formatted |
| 913 sample_cluster, | 2408 sample_cluster, # a binary input of TRUE or FALSE, |
| 914 merged_asterisk, | 2409 # indicating whether or not to perform |
| 915 my_cex_row, | 2410 # hierarchical clustering of the sample columns |
| 916 color_breaks, | 2411 merged_asterisk, # matrix having dimensions of x, values "*" or "" |
| 917 margins, | 2412 color_breaks, # breaks for color gradation, from create_breaks |
| 2413 # passed to `breaks` argument of `image` | |
| 2414 margins = c(8, 15), # two integers setting the bottom and right margins | |
| 2415 # to accommodate row and column labels | |
| 2416 master_cex = 0.7, # basis for text sizes | |
| 918 ... | 2417 ... |
| 919 ) { | 2418 ) { |
| 920 merged_scores <- x | 2419 merged_scores <- x |
| 921 if (!is.matrix(x)) { | 2420 if (!is.matrix(x)) { |
| 922 cat( | 2421 cat( |
| 924 "No plot because \\texttt{typeof(x)} is '", | 2423 "No plot because \\texttt{typeof(x)} is '", |
| 925 typeof(x), | 2424 typeof(x), |
| 926 "' rather than 'matrix'.\n\n" | 2425 "' rather than 'matrix'.\n\n" |
| 927 ) | 2426 ) |
| 928 ) | 2427 ) |
| 929 } else if (nrow(x) < 2) { | 2428 cat_variable(x) |
| 930 cat("No plot because matrix has ", nrow(x), " rows.\n\n") | |
| 931 return(FALSE) | 2429 return(FALSE) |
| 932 } else if (ncol(x) < 2) { | 2430 } |
| 933 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | 2431 if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) |
| 2432 nrow_x <- nrow(x) | |
| 2433 ncol_x <- ncol(x) | |
| 2434 #if (nrow_x < 2) { | |
| 2435 if (nrow_x < 1) { | |
| 2436 cat("No plot because matrix has no rows.\n\n") | |
| 934 return(FALSE) | 2437 return(FALSE) |
| 935 } else { | 2438 } else if (nrow_x < 2) { |
| 936 my_limit <- 25 | 2439 cat("No plot because matrix has one row. Matrix looks like this:\n\n") |
| 937 my_cex_col <- my_limit / (my_limit + ncol(x)) | 2440 cat("\n\\begin{verbatim}\n") |
| 938 my_cex_row <- my_limit / (my_limit + nrow(x)) | 2441 print(x) |
| 939 my_scale <- 12.0 | 2442 cat("\n\\end{verbatim}\n") |
| 940 if (ncol(x) < 10 && nrow(x) < 10) | 2443 return(FALSE) |
| 941 my_scale <- my_scale * 10 / (10 - nrow(x)) * 10 / (10 - ncol(x)) | 2444 } else if (ncol_x < 2) { |
| 942 gplots::heatmap.2( | 2445 cat("No plot because matrix x has ", ncol_x, " columns.\n\n") |
| 943 x = merged_scores, | 2446 cat_variable(x) |
| 944 Colv = sample_cluster, | 2447 return(FALSE) |
| 945 breaks = color_breaks[[1]], | 2448 } |
| 946 cellnote = merged_asterisk, | 2449 max_nchar_rowname <- max(nchar(rownames(x))) |
| 947 cexCol = 0.9 * my_cex_col, | 2450 max_nchar_colname <- max(nchar(colnames(x))) |
| 948 cexRow = 2 * my_cex_row, | 2451 my_limit <- g_intensity_hm_rows |
| 949 col = color_breaks[[2]], | 2452 |
| 950 density.info = "none", | 2453 my_row_cex_scale <- master_cex * 150 / nrow_x |
| 951 key = FALSE, | 2454 my_col_cex_scale <- 3.0 |
| 952 lhei = c(0.4, 8.0, 1.1), | 2455 my_asterisk_scale <- 0.4 * my_row_cex_scale |
| 953 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | 2456 my_row_warp <- 1 |
| 954 lwid = c(0.5, 3), | 2457 my_note_warp <- 2 |
| 955 margins = margins, | 2458 my_row_warp <- 1 |
| 956 notecex = my_scale * my_cex_row * my_cex_col, | 2459 my_row_cex_asterisk <- |
| 957 notecol = "white", | 2460 master_cex * my_row_warp * my_asterisk_scale |
| 958 scale = "none", | 2461 |
| 959 srtCol = 45, | 2462 |
| 960 srtRow = 45, | 2463 my_col_cex <- my_col_cex_scale * master_cex |
| 961 trace = "none", | 2464 my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) |
| 962 ... | 2465 my_key_cex <- 1.286 |
| 963 ) | 2466 my_hm2_cex <- 1 * master_cex |
| 964 return(TRUE) | 2467 my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 |
| 965 } | 2468 if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) |
| 966 } | 2469 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) |
| 967 | 2470 my_offset <- 0.05 |
| 968 # Adapted from KSEAapp::KSEA.Heatmap | 2471 if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) |
| 2472 my_scale <- 3.0 | |
| 2473 if (ncol_x < 10 && nrow_x < 10) | |
| 2474 my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) | |
| 2475 | |
| 2476 my_heights <- c( | |
| 2477 0.15, | |
| 2478 3.85 - my_offset, | |
| 2479 0.5 + my_offset | |
| 2480 ) | |
| 2481 my_margins <- c(1, 1) + | |
| 2482 c( | |
| 2483 margins[1] * 0.08 * max_nchar_colname * my_col_cex, | |
| 2484 margins[2] * 0.04 * max_nchar_rowname * my_row_cex | |
| 2485 ) | |
| 2486 | |
| 2487 my_notecex <- | |
| 2488 my_scale * | |
| 2489 min( | |
| 2490 1.1, | |
| 2491 my_row_cex_asterisk * my_note_warp, | |
| 2492 my_col_cex * my_note_warp | |
| 2493 ) | |
| 2494 | |
| 2495 if (print_trace_messages) { | |
| 2496 cat_variable(my_heights, suffix = "; ") | |
| 2497 cat_variable(my_margins, suffix = "\n\n") | |
| 2498 cat_variable(my_row_cex_scale, suffix = "; ") | |
| 2499 cat_variable(my_col_cex_scale, suffix = "\n\n") | |
| 2500 cat_variable(my_row_cex_asterisk, suffix = "\n\n") | |
| 2501 cat_variable(my_row_cex, suffix = "; ") | |
| 2502 cat_variable(my_col_cex, suffix = "\n\n") | |
| 2503 cat_variable(my_row_cex, suffix = "; ") | |
| 2504 cat_variable(my_col_cex, suffix = "\n\n") | |
| 2505 } | |
| 2506 | |
| 2507 hm2plus( | |
| 2508 x = merged_scores, | |
| 2509 Colv = sample_cluster, | |
| 2510 cellnote = merged_asterisk, | |
| 2511 cex = my_hm2_cex, | |
| 2512 cexCol = my_col_cex, | |
| 2513 cexRow = my_row_cex, | |
| 2514 denhgt = 0.15, | |
| 2515 density_info = "none", | |
| 2516 denwid = 0.5, | |
| 2517 divergent = TRUE, | |
| 2518 key_par = list(cex = my_key_cex), | |
| 2519 key_xlab = "Z-score", | |
| 2520 margins = my_margins, | |
| 2521 notecex = my_scale * min( | |
| 2522 1.5, | |
| 2523 my_row_cex_asterisk * my_note_warp, | |
| 2524 my_col_cex * my_note_warp | |
| 2525 ), | |
| 2526 notecol = "white", | |
| 2527 scale = "none", | |
| 2528 srtcol = 90, | |
| 2529 srtrow = 0, | |
| 2530 trace = "none", | |
| 2531 mat = matrix( | |
| 2532 c( | |
| 2533 c(0, 3, 3), | |
| 2534 c(2, 1, 1), | |
| 2535 c(0, 4, 0) | |
| 2536 ), | |
| 2537 nrow = 3, | |
| 2538 ncol = 3, | |
| 2539 byrow = TRUE | |
| 2540 ), | |
| 2541 widths = c(0.5, 3.1, 0.9), | |
| 2542 heights = my_heights, | |
| 2543 ... | |
| 2544 ) | |
| 2545 return(TRUE) | |
| 2546 } | |
| 2547 | |
| 2548 # function drawing heatmap of contrast fold-change for each kinase, | |
| 2549 # adapted from KSEAapp::KSEA.Heatmap | |
| 969 ksea_heatmap <- function( | 2550 ksea_heatmap <- function( |
| 970 # the data frame outputs from the KSEA.Scores() function, in list format | 2551 # the data frame outputs from the KSEA.Scores() function, in list format |
| 971 score_list, | 2552 score_list, |
| 972 # a character vector of all the sample names for heatmap annotation: | 2553 # a character vector of all the sample names for heatmap annotation: |
| 973 # - the names must be in the same order as the data in score_list | 2554 # - the names must be in the same order as the data in score_list |
| 979 # a numeric value between 0 and infinity indicating the min. number of | 2560 # a numeric value between 0 and infinity indicating the min. number of |
| 980 # substrates a kinase must have to be included in the heatmap | 2561 # substrates a kinase must have to be included in the heatmap |
| 981 m_cutoff, | 2562 m_cutoff, |
| 982 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 2563 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
| 983 # for indicating significant kinases in the heatmap | 2564 # for indicating significant kinases in the heatmap |
| 984 p_cutoff = | 2565 p_cutoff = { |
| 985 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | 2566 cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") |
| 2567 param_df_exit() | |
| 2568 knitr::knit_exit() | |
| 2569 }, | |
| 986 # a binary input of TRUE or FALSE, indicating whether or not to perform | 2570 # a binary input of TRUE or FALSE, indicating whether or not to perform |
| 987 # hierarchical clustering of the sample columns | 2571 # hierarchical clustering of the sample columns |
| 988 sample_cluster, | 2572 sample_cluster, |
| 989 # a binary input of TRUE or FALSE, indicating whether or not to export | 2573 # a binary input of TRUE or FALSE, indicating whether or not to export |
| 990 # the heatmap as a .png image into the working directory | 2574 # the heatmap as a .png image into the working directory |
| 991 export = FALSE, | 2575 export = FALSE, |
| 992 # bottom and right margins; adjust as needed if contrast names are too long | 2576 # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long |
| 993 margins = c(6, 20), | 2577 margins = c(6, 6), |
| 994 # print which kinases? | 2578 # print which kinases? |
| 995 # - Mandatory argument, must be one of const_ksea_.*_kinases | 2579 # - Mandatory argument, must be one of const_ksea_.*_kinases |
| 996 which_kinases, | 2580 which_kinases, |
| 997 # additional arguments to gplots::heatmap.2, such as: | 2581 # additional arguments to gplots::heatmap.2, such as: |
| 998 # - main: main title of plot | 2582 # - main: main title of plot |
| 1037 } | 2621 } |
| 1038 return(new) | 2622 return(new) |
| 1039 } | 2623 } |
| 1040 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | 2624 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) |
| 1041 | 2625 |
| 1042 # begin hack to print only significant rows | |
| 1043 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | 2626 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 |
| 1044 all_rows <- rownames(merged_stats) | 2627 all_rows <- rownames(merged_stats) |
| 1045 names(asterisk_rows) <- all_rows | 2628 names(asterisk_rows) <- all_rows |
| 1046 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | 2629 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) |
| 1047 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | 2630 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) |
| 1048 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] | 2631 merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] |
| 1049 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] | 2632 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] |
| 1050 # end hack to print only significant rows | |
| 1051 | 2633 |
| 1052 row_list <- list() | 2634 row_list <- list() |
| 1053 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | 2635 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows |
| 1054 row_list[[const_ksea_all_kinases]] <- all_rows | 2636 row_list[[const_ksea_all_kinases]] <- all_rows |
| 1055 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | 2637 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows |
| 1060 stts <- merged_stats[my_row_names, , drop = FALSE] | 2642 stts <- merged_stats[my_row_names, , drop = FALSE] |
| 1061 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | 2643 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) |
| 1062 | 2644 |
| 1063 color_breaks <- create_breaks(scrs) | 2645 color_breaks <- create_breaks(scrs) |
| 1064 if (is.null(color_breaks)) { | 2646 if (is.null(color_breaks)) { |
| 1065 cat("No plot because matrix has too many missing values.\n\n") | 2647 cat("No plot because matrix has too few rows.\n\n") |
| 1066 return(NULL) | 2648 return(NULL) |
| 1067 } | 2649 } |
| 1068 plot_height <- nrow(scrs) ^ 0.55 | 2650 plot_height <- nrow(scrs) ^ 0.55 |
| 1069 plot_width <- ncol(scrs) ^ 0.7 | 2651 plot_width <- ncol(scrs) ^ 0.7 |
| 1070 my_cex_row <- 0.25 * 16 / plot_height | |
| 1071 if (export == "TRUE") { | 2652 if (export == "TRUE") { |
| 1072 png( | 2653 png( |
| 1073 "KSEA.Merged.Heatmap.png", | 2654 "KSEA.Merged.Heatmap.png", |
| 1074 width = plot_width * 300, | 2655 width = plot_width * 300, |
| 1075 height = 2 * plot_height * 300, | 2656 height = 2 * plot_height * 300, |
| 1079 } | 2660 } |
| 1080 did_draw <- draw_kseaapp_summary_heatmap( | 2661 did_draw <- draw_kseaapp_summary_heatmap( |
| 1081 x = scrs, | 2662 x = scrs, |
| 1082 sample_cluster = sample_cluster, | 2663 sample_cluster = sample_cluster, |
| 1083 merged_asterisk = merged_asterisk, | 2664 merged_asterisk = merged_asterisk, |
| 1084 my_cex_row = my_cex_row, | |
| 1085 color_breaks = color_breaks, | 2665 color_breaks = color_breaks, |
| 1086 margins = margins | 2666 margins = margins |
| 1087 ) | 2667 ) |
| 1088 if (export == "TRUE") { | 2668 if (export == "TRUE") { |
| 1089 dev.off() | 2669 dev.off() |
| 1091 if (!did_draw) | 2671 if (!did_draw) |
| 1092 return(NULL) | 2672 return(NULL) |
| 1093 return(my_row_names) | 2673 return(my_row_names) |
| 1094 } | 2674 } |
| 1095 | 2675 |
| 1096 # helper for heatmaps of phosphopeptide intensities | 2676 # helpers for heatmaps of phosphopeptide intensities |
| 1097 | 2677 |
| 1098 draw_ppep_heatmap <- | 2678 # factory producing function to truncate string after n characters |
| 2679 trunc_n <- function(n) { | |
| 2680 function(x) { | |
| 2681 sapply( | |
| 2682 X = x, | |
| 2683 FUN = function(s) { | |
| 2684 if (is.na(s)) | |
| 2685 return("NA") | |
| 2686 cond <- try_catch_w_e(nchar(s) > n) | |
| 2687 if (!is.logical(cond$value)) { | |
| 2688 return(cond$value$message) | |
| 2689 } else if (cond$value) { | |
| 2690 paste0( | |
| 2691 strtrim(s, n), | |
| 2692 "..." | |
| 2693 ) | |
| 2694 } else { | |
| 2695 s | |
| 2696 } | |
| 2697 }, | |
| 2698 USE.NAMES = FALSE | |
| 2699 ) | |
| 2700 } | |
| 2701 } | |
| 2702 trunc_long_ppep <- function(x) trunc_n(40)(x) | |
| 2703 trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) | |
| 2704 trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) | |
| 2705 trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) | |
| 2706 | |
| 2707 # factory producing a function that returns a covariance | |
| 2708 # matrix's rows (and columns) having variance > v_min | |
| 2709 keep_cov_w_var_gtr_min <- function(v_min) { | |
| 2710 function(x) { | |
| 2711 if (!is.matrix(x)) | |
| 2712 return(NULL) | |
| 2713 keepers <- sapply( | |
| 2714 X = seq_len(nrow(x)), | |
| 2715 FUN = function(i) { | |
| 2716 if (x[i, i] < v_min) | |
| 2717 NA | |
| 2718 else | |
| 2719 x[i, i] | |
| 2720 } | |
| 2721 ) | |
| 2722 names(keepers) <- rownames(x) | |
| 2723 keepers <- keepers[!is.na(keepers)] | |
| 2724 keepers <- names(keepers) | |
| 2725 if (length(keepers) == 0) | |
| 2726 return(NULL) | |
| 2727 x[keepers, keepers] | |
| 2728 } | |
| 2729 } | |
| 2730 # function that returns a matrix's rows having variance > 1 | |
| 2731 keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) | |
| 2732 | |
| 2733 # factory producing a function that returns | |
| 2734 # - either a matrix's rows (rows = TRUE) | |
| 2735 # - or a matrix's columns (rows = FALSE) | |
| 2736 # having variance > v_min | |
| 2737 keep_var_gtr_min <- function(v_min) { | |
| 2738 function(x, rows = TRUE) { | |
| 2739 nrowcol <- if (rows) nrow else ncol | |
| 2740 if (!is.matrix(x)) | |
| 2741 return(NULL) | |
| 2742 keepers <- sapply( | |
| 2743 X = seq_len(nrowcol(x)), | |
| 2744 FUN = function(i) { | |
| 2745 row_var <- var( | |
| 2746 if (rows) x[i, ] else x[, i], | |
| 2747 na.rm = TRUE | |
| 2748 ) | |
| 2749 if (is.na(row_var) || row_var <= v_min) NA else i | |
| 2750 } | |
| 2751 ) | |
| 2752 keepers <- keepers[!is.na(keepers)] | |
| 2753 if (rows) x[keepers, ] else x[, keepers] | |
| 2754 } | |
| 2755 } | |
| 2756 | |
| 2757 keep_var_gtr_0 <- keep_var_gtr_min(0) | |
| 2758 | |
| 2759 # function drawing heatmap of phosphopeptide intensities | |
| 2760 ppep_heatmap <- | |
| 1099 function( | 2761 function( |
| 1100 m, # matrix with rownames already formatted | 2762 m, # matrix with rownames already formatted |
| 1101 cutoff, # cutoff used by hm_heading_function | 2763 cutoff, # cutoff used by hm_heading_function |
| 1102 hm_heading_function, # construct and cat heading from m and cutoff | 2764 hm_heading_function, # construct $ cat heading from m and cutoff |
| 1103 hm_main_title, # main title for plot (drawn below heading) | 2765 hm_main_title, # main title for plot (drawn below heading) |
| 1104 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | 2766 suppress_row_dendrogram = TRUE, # set to false to show dendrogram |
| 1105 max_peptide_count # experimental: | 2767 max_peptide_count = # experimental: |
| 1106 = intensity_hm_rows, # values of 50 and 75 worked well | 2768 g_intensity_hm_rows, # values of 50 and 75 worked well |
| 1107 ... # passthru parameters for heatmap | 2769 master_cex = 1.0, # basis for text sizes |
| 2770 margins = NULL, # optional margins (bottom, right) | |
| 2771 cellnote = NULL, # optional matrix of character; dim = dim(m) | |
| 2772 adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right | |
| 2773 ... # passthru to hm2plus or heatmap.2 | |
| 1108 ) { | 2774 ) { |
| 2775 use_heatmap_1 <- FALSE | |
| 1109 peptide_count <- 0 | 2776 peptide_count <- 0 |
| 1110 # emit the heading for the heatmap | 2777 # emit the heading for the heatmap |
| 1111 if (hm_heading_function(m, cutoff)) { | 2778 if (hm_heading_function(m, cutoff)) { |
| 1112 peptide_count <- min(max_peptide_count, nrow(m)) | 2779 nrow_m <- nrow(m) |
| 1113 if (nrow(m) > 0) { | 2780 peptide_count <- min(max_peptide_count, nrow_m) |
| 2781 if (nrow_m > 1) { | |
| 1114 m_margin <- m[peptide_count:1, ] | 2782 m_margin <- m[peptide_count:1, ] |
| 1115 # Margin setting was heuristically derived | 2783 # Margin was heuristically derived to accommodate the widest label |
| 1116 margins <- | 2784 row_mchar_max <- max(nchar(rownames(m_margin))) |
| 1117 c(0.5, # col | 2785 col_mchar_max <- max(nchar(colnames(m_margin))) |
| 1118 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | 2786 row_margin <- master_cex * row_mchar_max * 2.6 |
| 1119 ) | 2787 col_margin <- master_cex * col_mchar_max * 2.6 |
| 1120 } | 2788 if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin)) |
| 1121 if (nrow(m) > 0) { | 2789 if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin)) |
| 1122 hm_call <- NULL | 2790 hm_call <- NULL |
| 1123 tryCatch( | 2791 tryCatch( |
| 1124 { | 2792 { |
| 1125 old_oma <- par("oma") | 2793 # set non-argument parameters for hm_call inner function |
| 1126 par(cex.main = 0.6) | 2794 my_row_cex <- |
| 1127 # Heuristically determined character size adjustment formula | 2795 master_cex * 200000 / ( |
| 1128 my_cex_row <- | 2796 (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows |
| 1129 250000 / ( | |
| 1130 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
| 1131 ) | 2797 ) |
| 1132 m_hm <- m[peptide_count:1, , drop = FALSE] | 2798 m_hm <- m[peptide_count:1, , drop = FALSE] |
| 1133 my_limit <- 60 | 2799 if (is.null(cellnote)) { |
| 1134 my_cex_col <- 0.75 * my_limit / (my_limit + ncol(m_hm)) | 2800 cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) |
| 2801 cellnote[is.na(m_hm)] <- "NA" | |
| 2802 } else { | |
| 2803 cellnote <- cellnote[peptide_count:1, , drop = FALSE] | |
| 2804 } | |
| 2805 m_hm[is.na(m_hm)] <- 0 | |
| 2806 nrow_m_hm <- nrow(m_hm) | |
| 2807 ncol_m_hm <- ncol(m_hm) | |
| 2808 if (nrow_m_hm < 1 || ncol_m_hm < 1) | |
| 2809 return(peptide_count) # return zero as initialized above | |
| 2810 my_limit <- g_intensity_hm_rows | |
| 2811 | |
| 2812 | |
| 2813 my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) | |
| 2814 my_col_cex <- master_cex * 6 * row_margin / col_margin | |
| 2815 my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex | |
| 2816 my_col_cex <- min(my_col_cex, my_row_cex) | |
| 2817 col_margin <- sqrt(my_col_adj) * col_margin | |
| 2818 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
| 2819 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
| 2820 if (is.null(margins)) my_margins <- | |
| 2821 c( | |
| 2822 (my_col_cex + col_margin), # col | |
| 2823 (my_row_cex + row_margin) / my_row_cex # row | |
| 2824 ) | |
| 2825 else | |
| 2826 my_margins <- margins | |
| 2827 | |
| 2828 if (print_trace_messages) cat( | |
| 2829 sprintf( | |
| 2830 "my_margins = c(%s)\n\n", | |
| 2831 paste(my_margins, collapse = ", ") | |
| 2832 ) | |
| 2833 ) | |
| 2834 my_hm2_cex <- 2 * master_cex | |
| 2835 my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows | |
| 2836 my_key_warp <- 1.5 * 22.75 / row_margin | |
| 2837 my_key_cex <- min(1.10, my_key_warp * my_key_cex) | |
| 2838 my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) | |
| 2839 my_hgt_scale <- 3.75 # 3.615 | |
| 2840 my_hgt_scale <- 3.60 # 3.615 | |
| 2841 if (print_trace_messages) | |
| 2842 cat_variable(my_hgt_scale, "\n\n", 3) | |
| 2843 my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) | |
| 2844 if (print_trace_messages) | |
| 2845 cat_variable(my_warp, "\n\n", 3) | |
| 2846 # added 0.9 heuristically... | |
| 2847 my_plot_height <- | |
| 2848 (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * | |
| 2849 min(my_hgt_scale, my_hgt_scale * my_warp) | |
| 2850 my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) | |
| 2851 my_heights <- c( | |
| 2852 0.3, # title and top dendrogram | |
| 2853 my_plot_height, # plot and bottom margin | |
| 2854 4.15 - my_hgt_scale, # legend | |
| 2855 0.05 + my_hgt_scale - my_plot_height # whitespace below legend | |
| 2856 ) | |
| 2857 my_note_cex <- min(0.8, my_row_cex, my_col_cex) | |
| 2858 if (print_trace_messages) { | |
| 2859 cat_variable(my_plot_height, "\n\n", 3) | |
| 2860 cat_variable(4.19 - my_hgt_scale, "\n\n", 3) | |
| 2861 cat_variable(nrow_m_hm, "; ", 0) | |
| 2862 cat_variable(ncol_m_hm, "; ", 0) | |
| 2863 cat_variable(my_row_cex, "; ", 3) | |
| 2864 cat_variable(my_col_cex, "; ", 3) | |
| 2865 cat_variable(my_note_cex, "; ", 3) | |
| 2866 cat_variable(my_key_cex, "\n\n", 3) | |
| 2867 cat_variable(my_hgt_scale, "; ", 3) | |
| 2868 cat_variable(my_plot_height, "; ", 3) | |
| 2869 cat_variable(my_warp, "\n\n", 3) | |
| 2870 cat_variable(my_heights, "; ", 2) | |
| 2871 cat_variable(sum(my_heights), "\n\n", 3) | |
| 2872 } | |
| 2873 | |
| 2874 # define hm_call inner function | |
| 1135 hm_call <- function(x, scaling, title) { | 2875 hm_call <- function(x, scaling, title) { |
| 1136 heatmap( | 2876 my_cex_main <- min(5.0, 220 / nchar(title)) |
| 1137 x, | 2877 op <- par( |
| 1138 Rowv = if (suppress_row_dendrogram) NA else NULL, | 2878 cex.main = my_cex_main * master_cex, |
| 1139 Colv = NA, | 2879 adj = adj |
| 1140 cexRow = my_cex_row, | 2880 ) |
| 1141 cexCol = my_cex_col, | 2881 if ( |
| 1142 scale = scaling, | 2882 !is.null( |
| 1143 margins = margins, | 2883 hm2plus( |
| 1144 main = title, | 2884 x, |
| 1145 xlab = "", | 2885 Colv = NA, |
| 1146 las = 1, | 2886 Rowv = TRUE, |
| 1147 ... | 2887 cexRow = my_row_cex, |
| 2888 cexCol = my_col_cex, | |
| 2889 dendrogram = "row", | |
| 2890 las = 1, | |
| 2891 main = title, | |
| 2892 key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
| 2893 cex = my_hm2_cex, | |
| 2894 key_par = list(cex = my_key_cex), | |
| 2895 margins = my_margins, | |
| 2896 widths = c(0.4, 2.6, 1.5), | |
| 2897 heights = my_heights, | |
| 2898 mat = matrix( | |
| 2899 c( | |
| 2900 c(0, 3, 3), | |
| 2901 c(2, 1, 1), | |
| 2902 c(0, 4, 0), | |
| 2903 c(0, 0, 0) | |
| 2904 ), | |
| 2905 nrow = 4, | |
| 2906 ncol = 3, | |
| 2907 byrow = TRUE | |
| 2908 ), | |
| 2909 na.rm = TRUE, | |
| 2910 scale = scaling, | |
| 2911 srtcol = 90, | |
| 2912 srtrow = 0, | |
| 2913 xlab = "", | |
| 2914 cellnote = cellnote, | |
| 2915 notecex = my_note_cex, | |
| 2916 ... | |
| 2917 ) | |
| 1148 ) | 2918 ) |
| 2919 ) { | |
| 2920 if (print_trace_messages) cat( | |
| 2921 sprintf( | |
| 2922 "my_heights = c(%s); sum = %0.3f\n\n", | |
| 2923 paste( | |
| 2924 sprintf("%0.3f", my_heights), | |
| 2925 collapse = ", " | |
| 2926 ), | |
| 2927 sum(my_heights) | |
| 2928 ) | |
| 2929 ) | |
| 2930 if (print_trace_messages) cat( | |
| 2931 sprintf("my_key_cex = %0.3f\n\n", | |
| 2932 my_key_cex) | |
| 2933 ) | |
| 2934 if (print_trace_messages) cat( | |
| 2935 sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", | |
| 2936 my_key_cex / my_heights[3]) | |
| 2937 ) | |
| 2938 if (print_trace_messages) cat( | |
| 2939 sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", | |
| 2940 my_heights[2] / my_heights[3]) | |
| 2941 ) | |
| 2942 } | |
| 2943 par(op) | |
| 1149 } | 2944 } |
| 2945 | |
| 2946 # invoke hm_call inner function | |
| 1150 if (sum(rowSums(!is.na(m_hm)) < 2)) | 2947 if (sum(rowSums(!is.na(m_hm)) < 2)) |
| 1151 hm_call( | 2948 hm_call( |
| 1152 m_hm, | 2949 m_hm, |
| 1153 "none", | 2950 "none", |
| 1154 "log(intensities), unscaled, unimputed, and unnormalized" | 2951 "log(intensities), unscaled, unimputed, and unnormalized" |
| 1168 if (nrow(m_hm) > 1) | 2965 if (nrow(m_hm) > 1) |
| 1169 hm_call( | 2966 hm_call( |
| 1170 m_hm, | 2967 m_hm, |
| 1171 "none", | 2968 "none", |
| 1172 paste( | 2969 paste( |
| 1173 "log(intensities), unscaled, unimputed,", | 2970 "log(intensities), unscaled,", |
| 1174 "NAs zeroed, unnormalized" | 2971 "zero-imputed, unnormalized" |
| 1175 ) | 2972 ) |
| 1176 ) | 2973 ) |
| 1177 else | 2974 else |
| 1178 cat("\nThere are too few peptides to produce a heatmap.\n") | 2975 cat("\nThere are too few peptides to produce a heatmap.\n") |
| 1179 }, | 2976 }, |
| 1180 error = function(r) { | 2977 error = function(r) { |
| 1181 cat( | 2978 cat( |
| 1182 sprintf( | 2979 sprintf( |
| 1183 "\n%s %s Internal message: %s\n", | 2980 "\n%s %s Internal message: %s\n\\newline\n\n", |
| 1184 "Could not draw heatmap,", | 2981 "Failure drawing heatmap,", |
| 1185 "possibly because of too many missing values.", | 2982 "possibly because of too many missing values.\n\\newline\n\n", |
| 1186 r$message | 2983 r$message |
| 1187 ) | 2984 ) |
| 1188 ) | 2985 ) |
| 2986 cat_margins() | |
| 1189 } | 2987 } |
| 1190 ) | 2988 ) |
| 1191 } else { | 2989 } else { |
| 1192 cat( | 2990 cat( |
| 1193 "\nCould not draw heatmap, possibly because of too many missing values.\n" | 2991 "\nFailure drawing heatmap, possibly because of too many missing values.\n" |
| 1194 ) | 2992 ) |
| 1195 } | 2993 } |
| 1196 }, | 2994 } |
| 1197 finally = par(old_oma) | |
| 1198 ) | 2995 ) |
| 1199 } | 2996 } |
| 1200 } | 2997 } |
| 1201 return(peptide_count) | 2998 # return value: |
| 1202 } | 2999 peptide_count |
| 3000 } | |
| 3001 | |
| 3002 # function drawing heatmap of correlations if they exist, else covariances | |
| 3003 cov_heatmap <- | |
| 3004 function( | |
| 3005 m, # matrix with rownames already formatted | |
| 3006 top_substrates = FALSE, | |
| 3007 ... # passthru to hm2plus or heatmap.2 | |
| 3008 ) { | |
| 3009 if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n") | |
| 3010 #ACE nb(rowSums(m, na.rm = TRUE)) | |
| 3011 #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0) | |
| 3012 #ACE nb(see_variable(bad_rows)) | |
| 3013 #ACE m <- m[-bad_rows, , drop = FALSE] | |
| 3014 colnames_m <- colnames(m) | |
| 3015 is_na_m <- is.na(m) | |
| 3016 tmp <- m | |
| 3017 tmp[is_na_m] <- 0 | |
| 3018 | |
| 3019 tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE | |
| 3020 | |
| 3021 colnames_tmp <- colnames(tmp) | |
| 3022 | |
| 3023 my_low_p_seq <- seq( | |
| 3024 from = min(g_intensity_hm_rows, nrow(m)), | |
| 3025 to = 1, | |
| 3026 by = -1 | |
| 3027 ) | |
| 3028 | |
| 3029 if (g_correlate_substrates) { | |
| 3030 # zap samples having zero or near-zero variance | |
| 3031 tmp[is.na(tmp)] <- 0 | |
| 3032 nzv <- caret::nearZeroVar( | |
| 3033 tmp, # matrix of values, samples x variables | |
| 3034 freqCut = 1.01, # min(freq most prevalent value / | |
| 3035 # freq second most prevalent) | |
| 3036 uniqueCut = 99 # max(number of unique values / | |
| 3037 # total number of samples) | |
| 3038 ) | |
| 3039 tmp <- if (length(nzv) > 0) { | |
| 3040 m[, -nzv, drop = FALSE] | |
| 3041 } else { | |
| 3042 m | |
| 3043 } | |
| 3044 } else { | |
| 3045 tmp <- m[my_low_p_seq, , drop = FALSE] | |
| 3046 } | |
| 3047 | |
| 3048 | |
| 3049 t_m <- t(tmp) | |
| 3050 t_m[is.na(t_m)] <- 0 | |
| 3051 prefiltered_nrow <- ncol(t_m) | |
| 3052 | |
| 3053 my_corcov <- cov(t_m) | |
| 3054 did_filter_rows <- did_filter_cols <- FALSE | |
| 3055 if (g_correlate_substrates && !is_positive_definite(my_corcov)) { | |
| 3056 my_correlate_substrates <- FALSE | |
| 3057 t_m <- t(m[my_low_p_seq, , drop = FALSE]) | |
| 3058 t_m[is.na(t_m)] <- 0 | |
| 3059 unfiltered_row_count <- ncol(t_m) | |
| 3060 unfiltered_col_count <- nrow(t_m) | |
| 3061 | |
| 3062 # zap empty samples | |
| 3063 t_m <- t_m[0 < rowSums(x = t_m), ] | |
| 3064 # zap substrates present in fewer than two samples | |
| 3065 foo <- t_m > 0 | |
| 3066 foo <- colSums(x = foo) > 1 | |
| 3067 t_m <- t_m[, foo] | |
| 3068 | |
| 3069 did_filter_rows <- unfiltered_row_count > ncol(t_m) | |
| 3070 did_filter_cols <- unfiltered_col_count > nrow(t_m) | |
| 3071 | |
| 3072 colnames_tmp <- rownames(t_m) | |
| 3073 my_corcov <- cov(t_m) | |
| 3074 if (g_filter_cov_var_gt_1) { | |
| 3075 my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
| 3076 } | |
| 3077 } else if (g_correlate_substrates) { | |
| 3078 my_corcov <- cov2cor(my_corcov) | |
| 3079 my_correlate_substrates <- TRUE | |
| 3080 } else { | |
| 3081 my_correlate_substrates <- FALSE | |
| 3082 if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) | |
| 3083 } | |
| 3084 | |
| 3085 omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] | |
| 3086 suffix <- if (length(omitted_samples) > 1) "s" else "" | |
| 3087 | |
| 3088 f_omissions <- | |
| 3089 function(is_corr) { | |
| 3090 cat( | |
| 3091 sprintf( | |
| 3092 "Below is the %s plot for %s substrates", | |
| 3093 if (is_corr) "correlation" else "covariance", | |
| 3094 sprintf( | |
| 3095 if (top_substrates) | |
| 3096 "%0.0f \"highest-quality\"" | |
| 3097 else | |
| 3098 "%0.0f", | |
| 3099 ncol(t_m) | |
| 3100 ) | |
| 3101 ) | |
| 3102 ) | |
| 3103 if (did_filter_cols) { | |
| 3104 cat(sprintf(", omitting sample%s ", suffix)) | |
| 3105 latex_collapsed_vector(", ", omitted_samples) | |
| 3106 } | |
| 3107 cat(".\n\n") | |
| 3108 } | |
| 3109 | |
| 3110 if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { | |
| 3111 cat( | |
| 3112 sprintf( | |
| 3113 "\\newline\n%s %s plot.\n", | |
| 3114 "Insufficient covariance to produce", | |
| 3115 if (my_correlate_substrates) | |
| 3116 "correlation" | |
| 3117 else | |
| 3118 "covariance" | |
| 3119 ), | |
| 3120 "\\newpage\n" | |
| 3121 ) | |
| 3122 return(NULL) | |
| 3123 } | |
| 3124 | |
| 3125 cat("\\leavevmode\n", "\\newpage\n") | |
| 3126 f_omissions(my_correlate_substrates) | |
| 3127 | |
| 3128 master_cex <- 0.4 | |
| 3129 max_nchar <- max(nchar(rownames(t_m))) | |
| 3130 my_limit <- g_intensity_hm_rows | |
| 3131 diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) | |
| 3132 my_row_cex <- | |
| 3133 my_col_cex <- | |
| 3134 min(1.75, master_cex * 9 * diminution ^ 1.5) | |
| 3135 my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) | |
| 3136 my_key_cex <- 1.4 | |
| 3137 my_hm2_cex <- 1.0 * master_cex | |
| 3138 my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) | |
| 3139 my_hgt_scale <- 2.7 | |
| 3140 | |
| 3141 my_legend_height <- 4.0 - my_hgt_scale | |
| 3142 my_legend_height <- 0.5 * my_key_cex | |
| 3143 my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit | |
| 3144 my_warp <- 0.8 | |
| 3145 my_legend_height <- 0.77 | |
| 3146 my_legend_height <- 0.67 | |
| 3147 my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height | |
| 3148 my_legend_height <- my_warp * my_legend_height | |
| 3149 | |
| 3150 parjust <- par(adj = 0.5) | |
| 3151 on.exit(par(parjust)) | |
| 3152 my_corcov <- my_corcov[order(rownames(my_corcov)), ] | |
| 3153 my_main <- | |
| 3154 sprintf("%s among %s substrates %s", | |
| 3155 if (my_correlate_substrates) "Correlation" | |
| 3156 else "Covariance", | |
| 3157 kinase_name, | |
| 3158 if (!my_correlate_substrates && | |
| 3159 g_filter_cov_var_gt_1 && | |
| 3160 did_filter_rows | |
| 3161 ) | |
| 3162 "having variance > 1" | |
| 3163 else "" | |
| 3164 ) | |
| 3165 my_main_nchar <- nchar(my_main) | |
| 3166 my_heights <- c( | |
| 3167 0.3, | |
| 3168 my_plot_height, | |
| 3169 my_legend_height # was 4.0 - my_hgt_scale # was 4.19 | |
| 3170 ) | |
| 3171 if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) | |
| 3172 if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) | |
| 3173 if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) | |
| 3174 if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) | |
| 3175 if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) | |
| 3176 if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) | |
| 3177 if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) | |
| 3178 if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) | |
| 3179 if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height)) | |
| 3180 if (print_trace_messages) cat( | |
| 3181 sprintf( | |
| 3182 "my_heights = c(%s); sum = %0.3f\n\n", | |
| 3183 paste( | |
| 3184 sprintf("%0.3f", my_heights), | |
| 3185 collapse = ", " | |
| 3186 ), | |
| 3187 sum(my_heights) | |
| 3188 ) | |
| 3189 ) | |
| 3190 op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) | |
| 3191 on.exit(par(op)) | |
| 3192 hm2plus( | |
| 3193 x = my_corcov, | |
| 3194 cex = my_hm2_cex, | |
| 3195 cexCol = my_col_cex, | |
| 3196 cexRow = my_row_cex, | |
| 3197 density_info = "none", | |
| 3198 denhgt = 0.15, | |
| 3199 denwid = 0.5, | |
| 3200 divergent = TRUE, | |
| 3201 key_par = list(cex = my_key_cex), | |
| 3202 key_xlab = if (my_correlate_substrates) "Correlation" | |
| 3203 else "Covariance", | |
| 3204 main = my_main, | |
| 3205 mat = matrix( | |
| 3206 c( | |
| 3207 c(0, 3, 3), | |
| 3208 c(2, 1, 1), | |
| 3209 c(0, 4, 0) | |
| 3210 ), | |
| 3211 nrow = 3, | |
| 3212 ncol = 3, | |
| 3213 byrow = TRUE | |
| 3214 ), | |
| 3215 heights = my_heights, | |
| 3216 margins = c(my_margin, my_margin), | |
| 3217 widths = c(0.5, 3.1, 0.9), | |
| 3218 scale = "none", | |
| 3219 symkey = TRUE, | |
| 3220 symbreaks = TRUE, | |
| 3221 symm = FALSE #TODO evaluate TRUE | |
| 3222 # ... | |
| 3223 ) | |
| 3224 } # end cov_heatmap | |
| 3225 | |
| 3226 ### FILE IMPORT | |
| 3227 | |
| 3228 # function reading bzipped file to data.frame | |
| 3229 bzip2df <- function(d, f, ctor = bzfile) { | |
| 3230 # read.delim file (by default, compressed by bzip2) | |
| 3231 if (file.exists(f)) { | |
| 3232 conn <- NULL | |
| 3233 pf <- parent.frame() | |
| 3234 tryCatch( | |
| 3235 assign( | |
| 3236 as.character(substitute(d)), | |
| 3237 read.delim(conn <- bzfile(f, open = "r")), | |
| 3238 pf | |
| 3239 ), | |
| 3240 finally = if (!is.null(conn)) close(conn) | |
| 3241 ) | |
| 3242 } | |
| 3243 } | |
| 3244 | |
| 1203 ``` | 3245 ``` |
| 1204 | 3246 |
| 1205 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 1206 cat("\\listoftables\n") | |
| 1207 ``` | |
| 1208 # Purpose | 3247 # Purpose |
| 1209 | 3248 |
| 1210 To perform for phosphopeptides: | 3249 The purpose of this analysis is to perform for phosphopeptides: |
| 1211 | 3250 |
| 1212 - imputation of missing values, | 3251 - imputation of missing values, |
| 1213 - quantile normalization, | 3252 - quantile normalization, |
| 1214 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | 3253 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), |
| 3254 - assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and | |
| 1215 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | 3255 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: |
| 1216 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | 3256 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) |
| 1217 - The Human Proteome Database [http://hprd.org](http://hprd.org) | 3257 - The Human Proteome Database [http://hprd.org](http://hprd.org) |
| 1218 - NetworKIN [http://networkin.science/](http://networkin.science/) | 3258 - NetworKIN [http://networkin.science/](http://networkin.science/) |
| 1219 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | 3259 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) |
| 1220 | 3260 |
| 1221 ```{r include = FALSE} | 3261 ```{r include = FALSE} |
| 1222 | 3262 |
| 1223 ### GLOBAL VARIABLES | 3263 if (params$kseaUseAbsoluteLog2FC) { |
| 1224 | 3264 sfc <- "|s|" |
| 1225 # parameters for KSEA | 3265 pfc <- "|p|" |
| 1226 | 3266 pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" |
| 1227 ksea_cutoff_statistic <- params$kseaCutoffStatistic | 3267 } else { |
| 1228 ksea_cutoff_threshold <- params$kseaCutoffThreshold | 3268 sfc <- "s" |
| 1229 ksea_min_kinase_count <- params$kseaMinKinaseCount | 3269 pfc <- "p" |
| 3270 pfc_txt <- "${\\log_2 (\\text{fold-change}})$" | |
| 3271 } | |
| 1230 | 3272 |
| 1231 ksea_heatmap_titles <- list() | 3273 ksea_heatmap_titles <- list() |
| 1232 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | 3274 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- |
| 1233 sprintf( | 3275 sprintf( |
| 1234 "Summary for all kinases enriched in one or more contrasts at %s < %s", | 3276 "Summary for all kinases enriched in one or more contrasts at %s < %s", |
| 1244 ksea_cutoff_threshold | 3286 ksea_cutoff_threshold |
| 1245 ) | 3287 ) |
| 1246 # hash to hold names of significantly enriched kinases | 3288 # hash to hold names of significantly enriched kinases |
| 1247 ksea_asterisk_hash <- new_env() | 3289 ksea_asterisk_hash <- new_env() |
| 1248 | 3290 |
| 1249 # READ PARAMETERS (mostly) | 3291 # PROCESS (mostly read) PARAMETERS |
| 1250 | |
| 1251 intensity_hm_rows <- params$intensityHeatmapRows | |
| 1252 # Input Filename | |
| 1253 input_file <- params$inputFile | |
| 1254 | |
| 1255 # First data column - ideally, this could be detected via regexSampleNames, | |
| 1256 # but for now leave it as is. | |
| 1257 first_data_column <- params$firstDataColumn | |
| 1258 fdc_is_integer <- is.integer(first_data_column) | |
| 1259 if (fdc_is_integer) { | |
| 1260 first_data_column <- as.integer(params$firstDataColumn) | |
| 1261 } | |
| 1262 | 3292 |
| 1263 # False discovery rate adjustment for ANOVA | 3293 # False discovery rate adjustment for ANOVA |
| 1264 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | 3294 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 |
| 1265 val_fdr <- | 3295 val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") |
| 1266 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") | |
| 1267 | 3296 |
| 1268 if ( | 3297 if ( |
| 1269 ncol(val_fdr) != 1 || | 3298 ncol(val_fdr) != 1 || |
| 1270 sum(!is.numeric(val_fdr[, 1])) || | 3299 sum(!is.numeric(val_fdr[, 1])) || |
| 1271 sum(val_fdr[, 1] < 0) || | 3300 sum(val_fdr[, 1] < 0) || |
| 1272 sum(val_fdr[, 1] > 1) | 3301 sum(val_fdr[, 1] > 1) |
| 1273 ) { | 3302 ) { |
| 1274 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") | 3303 cat("alphaFile should be one column of numbers within the range [0.0,1.0]") |
| 3304 param_df_exit() | |
| 3305 knitr::knit_exit() | |
| 1275 } | 3306 } |
| 1276 val_fdr <- val_fdr[, 1] | 3307 val_fdr <- val_fdr[, 1] |
| 1277 | 3308 |
| 1278 #Imputed Data filename | |
| 1279 imputed_data_filename <- params$imputedDataFilename | |
| 1280 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
| 1281 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
| 1282 | |
| 1283 ``` | 3309 ``` |
| 1284 | 3310 |
| 1285 ```{r echo = FALSE} | 3311 ```{r echo = FALSE, results = 'asis'} |
| 1286 # Imputation method, should be one of | 3312 |
| 1287 # "random", "group-median", "median", or "mean" | 3313 |
| 1288 imputation_method <- params$imputationMethod | |
| 1289 | |
| 1290 # Selection of percentile of logvalue data to set the mean for random number | |
| 1291 # generation when using random imputation | |
| 1292 mean_percentile <- params$meanPercentile / 100.0 | |
| 1293 | |
| 1294 # deviation adjustment-factor for random values; real number. | |
| 1295 sd_percentile <- params$sdPercentile | |
| 1296 | |
| 1297 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
| 1298 regex_sample_names <- params$regexSampleNames | |
| 1299 | |
| 1300 # Regular expression to extract Sample Grouping from Sample Name; | |
| 1301 # if error occurs, compare sample_treatment_levels vs. sample_name_matches | |
| 1302 # to see if groupings/pairs line up | |
| 1303 # e.g., "(\\d+)" | |
| 1304 regex_sample_grouping <- params$regexSampleGrouping | |
| 1305 | |
| 1306 one_way_all_categories_fname <- params$oneWayManyCategories | |
| 1307 one_way_all_categories <- try_catch_w_e( | |
| 1308 match.fun(one_way_all_categories_fname)) | |
| 1309 if (!is.function(one_way_all_categories$value)) { | |
| 1310 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
| 1311 write(one_way_all_categories$value$message, stderr()) | |
| 1312 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
| 1313 stop("Cannot continue. Goodbye.") | |
| 1314 } | |
| 1315 one_way_all_categories <- one_way_all_categories$value | |
| 1316 | |
| 1317 one_way_two_categories_fname <- params$oneWayManyCategories | |
| 1318 one_way_two_categories <- try_catch_w_e( | |
| 1319 match.fun(one_way_two_categories_fname)) | |
| 1320 if (!is.function(one_way_two_categories$value)) { | |
| 1321 cat("fatal error for parameter oneWayTwoCategories: \n") | |
| 1322 cat(one_way_two_categories$value$message, fill = TRUE) | |
| 1323 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
| 1324 stop("Cannot continue. Goodbye.") | |
| 1325 } | |
| 1326 one_way_two_categories <- one_way_two_categories$value | |
| 1327 | |
| 1328 preproc_db <- params$preprocDb | |
| 1329 ksea_app_prep_db <- params$kseaAppPrepDb | |
| 1330 result <- file.copy( | 3314 result <- file.copy( |
| 1331 from = preproc_db, | 3315 from = preproc_db, |
| 1332 to = ksea_app_prep_db, | 3316 to = ksea_app_prep_db, |
| 1333 overwrite = TRUE | 3317 overwrite = TRUE |
| 1334 ) | 3318 ) |
| 1339 preproc_db, | 3323 preproc_db, |
| 1340 ksea_app_prep_db, | 3324 ksea_app_prep_db, |
| 1341 ), | 3325 ), |
| 1342 stderr() | 3326 stderr() |
| 1343 ) | 3327 ) |
| 1344 if (sys.nframe() > 0) quit(save = "no", status = 1) | 3328 if (sys.nframe() > 0) { |
| 1345 stop("Cannot continue. Goodbye.") | 3329 cat("Cannot continue and quit() failed. Goodbye.") |
| 1346 } | 3330 param_df_exit() |
| 3331 knitr::knit_exit() | |
| 3332 # in case knit_exit doesn't exit | |
| 3333 quit(save = "no", status = 1) | |
| 3334 } | |
| 3335 } | |
| 3336 | |
| 3337 if (FALSE) { | |
| 3338 write.table(x = param_df, file = "test-data/params.txt") | |
| 3339 } | |
| 3340 | |
| 1347 ``` | 3341 ``` |
| 1348 | 3342 |
| 1349 ```{r echo = FALSE} | 3343 ```{r echo = FALSE} |
| 1350 ### READ DATA | 3344 ### READ DATA |
| 1351 | 3345 |
| 1356 sep = "\t", | 3350 sep = "\t", |
| 1357 header = TRUE, | 3351 header = TRUE, |
| 1358 quote = "", | 3352 quote = "", |
| 1359 check.names = FALSE | 3353 check.names = FALSE |
| 1360 ) | 3354 ) |
| 3355 | |
| 1361 ``` | 3356 ``` |
| 1362 | 3357 |
| 1363 # Extract Sample Classes and Names | 3358 |
| 1364 | 3359 # Extraction of Sample Classes and Names from Input Data |
| 1365 Column names parsed from input file are shown in Table 1; sample classes and names, in Table 2. | |
| 1366 | 3360 |
| 1367 ```{r echo = FALSE, results = 'asis'} | 3361 ```{r echo = FALSE, results = 'asis'} |
| 1368 | 3362 |
| 1369 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | 3363 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) |
| 3364 my_column_names <- names(full_data) | |
| 1370 | 3365 |
| 1371 if (!fdc_is_integer) { | 3366 if (!fdc_is_integer) { |
| 1372 if (length(data_column_indices) > 0) { | 3367 if (length(data_column_indices) > 0) { |
| 1373 first_data_column <- data_column_indices[1] | 3368 first_data_column <- data_column_indices[1] |
| 1374 } else { | 3369 } else { |
| 1375 stop(paste("failed to convert firstDataColumn:", first_data_column)) | 3370 cat(paste("failed to convert firstDataColumn:", first_data_column)) |
| 3371 param_df_exit() | |
| 3372 knitr::knit_exit() | |
| 1376 } | 3373 } |
| 1377 } | 3374 } |
| 1378 | 3375 |
| 1379 cat( | 3376 cat( |
| 1380 sprintf( | 3377 sprintf( |
| 1381 paste( | 3378 paste( |
| 1382 "\n\nThe input data file has peptide-intensity data for each sample", | 3379 "\n\nThe input data file has peptide-intensity data", |
| 1383 "in one of columns %d through %d.\n\n" | 3380 "in columns %d (\"%s\") through %d (\"%s\")." |
| 1384 ), | 3381 ), |
| 1385 min(data_column_indices), | 3382 tmp <- min(data_column_indices), |
| 1386 max(data_column_indices) | 3383 my_column_names[tmp], |
| 1387 ) | 3384 tmp <- max(data_column_indices), |
| 1388 ) | 3385 my_column_names[tmp] |
| 1389 | 3386 ) |
| 1390 # Write column names as a LaTeX enumerated list. | 3387 ) |
| 1391 column_name_df <- data.frame( | 3388 |
| 1392 column = seq_len(length(colnames(full_data))), | 3389 if (TRUE) { |
| 1393 name = paste0("\\verb@", colnames(full_data), "@") | 3390 cat0( |
| 1394 ) | 3391 table_offset(i = 1, new = TRUE), |
| 1395 cat("\n\\begin{tiny}\n") | 3392 "Sample classes and names are shown in ", |
| 1396 data_frame_latex( | 3393 table_href(), |
| 1397 x = column_name_df, | 3394 ".\n\n" |
| 1398 justification = "l l", | 3395 ) |
| 1399 centered = TRUE, | 3396 } else { |
| 1400 caption = "Input data column names", | 3397 cat0( |
| 1401 anchor = const_table_anchor_bp, | 3398 "\\newcounter{offset}\n", |
| 1402 underscore_whack = FALSE | 3399 "\\setcounter{offset}{\\value{table}}\n", |
| 1403 ) | 3400 "\\stepcounter{offset}\n", |
| 1404 cat("\n\\end{tiny}\n") | 3401 "Sample classes and names are shown in ", |
| 3402 table_href(), | |
| 3403 ".\n\n" | |
| 3404 ) | |
| 3405 } | |
| 3406 | |
| 3407 #TODO remove this unused variable and assignment | |
| 3408 if (FALSE) { | |
| 3409 # Write column names as a LaTeX enumerated list. | |
| 3410 column_name_df <- data.frame( | |
| 3411 column = seq_len(length(colnames(full_data))), | |
| 3412 name = paste0("\\verb@", colnames(full_data), "@") | |
| 3413 ) | |
| 3414 } | |
| 1405 | 3415 |
| 1406 ``` | 3416 ``` |
| 1407 | 3417 |
| 1408 ```{r echo = FALSE, results = 'asis'} | 3418 ```{r echo = FALSE, results = 'asis'} |
| 3419 # extract intensity columns from full_data to quant_data | |
| 1409 quant_data <- full_data[first_data_column:length(full_data)] | 3420 quant_data <- full_data[first_data_column:length(full_data)] |
| 1410 quant_data[quant_data == 0] <- NA | 3421 quant_data[quant_data == 0] <- NA |
| 1411 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide | 3422 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide |
| 3423 full_data_names <- colnames(quant_data) | |
| 1412 # Extract factors and trt-replicates using regular expressions. | 3424 # Extract factors and trt-replicates using regular expressions. |
| 1413 # Typically: | 3425 # Typically: |
| 1414 # regex_sample_names is "\\.\\d+[A-Z]$" | 3426 # regex_sample_names is "\\.\\d+[A-Z]$" |
| 1415 # regex_sample_grouping is "\\d+" | 3427 # regex_sample_grouping is "\\d+" |
| 1416 # This would distinguish trt-replicates by terminal letter [A-Z] | 3428 # This would distinguish trt-replicates by terminal letter [A-Z] |
| 1424 colnames(quant_data) <- sample_name_matches | 3436 colnames(quant_data) <- sample_name_matches |
| 1425 | 3437 |
| 1426 write_debug_file(quant_data) | 3438 write_debug_file(quant_data) |
| 1427 | 3439 |
| 1428 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | 3440 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) |
| 1429 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) | 3441 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) |
| 3442 | |
| 3443 if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) | |
| 3444 if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) | |
| 3445 if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) | |
| 3446 | |
| 3447 sample_treatment_df <- | |
| 3448 save_sample_treatment_df <- | |
| 3449 data.frame( | |
| 3450 class = smpl_trt, | |
| 3451 sample = sample_name_matches, | |
| 3452 full_sample_names = full_data_names | |
| 3453 ) | |
| 3454 | |
| 3455 if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) | |
| 3456 | |
| 3457 # reorder data | |
| 3458 my_order <- with(sample_treatment_df, order(class, sample)) | |
| 3459 quant_data <- quant_data[, my_order] | |
| 3460 sample_name_matches <- sample_name_matches[my_order] | |
| 3461 smpl_trt <- smpl_trt[my_order] | |
| 3462 sample_treatment_df <- data.frame( | |
| 3463 class = smpl_trt, | |
| 3464 sample = sample_name_matches | |
| 3465 ) | |
| 3466 | |
| 3467 # filter smpl_trt as appropriate | |
| 3468 if (sample_group_filter %in% c("include", "exclude")) { | |
| 3469 include_sample <- | |
| 3470 mgrepl( | |
| 3471 v = sample_group_filter_patterns, | |
| 3472 s = as.character(smpl_trt), | |
| 3473 fixed = sample_group_filter_fixed, | |
| 3474 perl = sample_group_filter_perl, | |
| 3475 ignore.case = sample_group_filter_nocase | |
| 3476 ) | |
| 3477 if (sum(include_sample) < 2) { | |
| 3478 errmsg <- | |
| 3479 paste( | |
| 3480 "ERROR:", | |
| 3481 sum(include_sample), | |
| 3482 "samples are too few for analysis;", | |
| 3483 "check input parameters for sample-name parsing" | |
| 3484 ) | |
| 3485 cat0( | |
| 3486 errmsg, | |
| 3487 "\\stepcounter{offset}\n", | |
| 3488 " in ", | |
| 3489 table_href(), | |
| 3490 ".\n\n" | |
| 3491 ) | |
| 3492 data_frame_tabbing_latex( | |
| 3493 x = save_sample_treatment_df, | |
| 3494 tabstops = c(1.25, 1.25), | |
| 3495 caption = "Sample classes", | |
| 3496 use_subsubsection_header = FALSE | |
| 3497 ) | |
| 3498 data_frame_tabbing_latex( | |
| 3499 x = | |
| 3500 param_df[ | |
| 3501 c("regexSampleNames", | |
| 3502 "regexSampleGrouping", | |
| 3503 "groupFilterPatterns", | |
| 3504 "groupFilter", | |
| 3505 "groupFilterMode" | |
| 3506 ), | |
| 3507 ], | |
| 3508 tabstops = c(1.75), | |
| 3509 underscore_whack = TRUE, | |
| 3510 caption = "Input parameters for sample-name parsing", | |
| 3511 verbatim = FALSE | |
| 3512 ) | |
| 3513 param_df_exit() | |
| 3514 knitr::knit_exit() | |
| 3515 return(invisible(-1)) | |
| 3516 } | |
| 3517 sample_treatment_df <- | |
| 3518 if (sample_group_filter == "include") | |
| 3519 sample_treatment_df[include_sample, ] | |
| 3520 else | |
| 3521 sample_treatment_df[!include_sample, ] | |
| 3522 } else { | |
| 3523 include_sample <- rep.int(TRUE, length(smpl_trt)) | |
| 3524 } | |
| 3525 sample_name_matches <- sample_treatment_df$sample | |
| 3526 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
| 3527 smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) | |
| 3528 sample_treatment_df$class <- smpl_trt | |
| 3529 | |
| 3530 # filter quant_data as appropriate | |
| 1430 number_of_samples <- length(sample_name_matches) | 3531 number_of_samples <- length(sample_name_matches) |
| 1431 sample_treatment_df <- data.frame( | 3532 quant_data <- quant_data[, sample_name_matches] |
| 1432 class = sample_treatment_levels, | 3533 |
| 1433 sample = sample_name_matches | 3534 sample_level_integers <- as.integer(smpl_trt) |
| 1434 ) | 3535 sample_treatment_levels <- levels(smpl_trt) |
| 1435 # reorder data | 3536 count_of_treatment_levels <- length(sample_treatment_levels) |
| 1436 if (TRUE) { | 3537 |
| 1437 my_order <- with(sample_treatment_df, order(class, sample)) | 3538 # for each phosphopeptide, across treatment levels, compute minimum |
| 1438 quant_data <- quant_data[, my_order] | 3539 # count of observed (i.e., non-missing) values |
| 1439 sample_name_matches <- sample_name_matches[my_order] | 3540 my_env <- new_env() |
| 1440 sample_treatment_levels <- sample_treatment_levels[my_order] | 3541 for (l in sample_treatment_levels) |
| 1441 } | 3542 my_env[[as.character(l)]] <- |
| 1442 sample_treatment_df <- data.frame( | 3543 as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) |
| 1443 class = sample_treatment_levels, | 3544 min_group_obs_count <- row_apply( |
| 1444 sample = sample_name_matches | 3545 x = Reduce( |
| 1445 ) | 3546 f = function(l, r) cbind(l, my_env[[r]]), |
| 1446 data_frame_latex( | 3547 x = sample_treatment_levels, |
| 3548 init = c() | |
| 3549 ), | |
| 3550 fun = min | |
| 3551 ) | |
| 3552 names(min_group_obs_count) <- rownames(quant_data) | |
| 3553 rm(my_env) | |
| 3554 | |
| 3555 # display (possibly-filtered) results | |
| 3556 cat("\\newpage\n") | |
| 3557 | |
| 3558 if (sum(include_sample) > 1) { | |
| 3559 data_frame_tabbing_latex( | |
| 1447 x = sample_treatment_df, | 3560 x = sample_treatment_df, |
| 1448 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", | 3561 tabstops = c(1.25), |
| 1449 centered = TRUE, | |
| 1450 caption = "Sample classes", | 3562 caption = "Sample classes", |
| 1451 anchor = const_table_anchor_tbp, | 3563 use_subsubsection_header = FALSE |
| 1452 underscore_whack = FALSE | 3564 ) |
| 1453 ) | 3565 } |
| 1454 sample_name_shrink <- 10 / (10 + max(nchar(sample_name_matches))) | 3566 sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) |
| 3567 sample_colsep <- transition_positions(as.integer(sample_treatment_df$class)) | |
| 1455 ``` | 3568 ``` |
| 1456 | 3569 |
| 1457 ```{r echo = FALSE, results = 'asis'} | 3570 ```{r echo = FALSE, results = 'asis'} |
| 1458 cat("\\newpage\n") | 3571 cat("\\newpage\n") |
| 1459 ``` | 3572 ``` |
| 1460 | 3573 |
| 1461 ## Are the log-transformed sample distributions similar? | 3574 ## Are the log-transformed sample distributions similar? |
| 1462 | 3575 |
| 1463 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 3576 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
| 1464 | 3577 |
| 1465 quant_data[quant_data == 0] <- NA #replace 0 with NA | 3578 quant_data[quant_data == 0] <- NA #replace 0 with NA |
| 1466 quant_data_log <- log10(quant_data) | 3579 quant_data_log <- log10(quant_data) |
| 1467 | 3580 |
| 1468 rownames(quant_data_log) <- rownames(quant_data) | 3581 rownames(quant_data_log) <- rownames(quant_data) |
| 1469 colnames(quant_data_log) <- sample_name_matches | 3582 colnames(quant_data_log) <- sample_name_matches |
| 1470 | 3583 |
| 1471 write_debug_file(quant_data_log) | 3584 write_debug_file(quant_data_log) |
| 1472 | 3585 |
| 1473 # data visualization | 3586 g_ppep_distrib_ctl <- new_env() |
| 1474 old_par <- par( | 3587 g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) |
| 1475 mai = par("mai") + c(0.5, 0, 0, 0) | 3588 g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) |
| 1476 ) | 3589 g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) |
| 1477 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 3590 |
| 1478 # Vertical plot | 3591 my_ppep_distrib_bxp <- function( |
| 1479 boxplot( | 3592 x |
| 1480 quant_data_log | 3593 , sample_name_grow = sample_name_grow |
| 1481 , las = 2 | 3594 , main |
| 1482 , cex.axis = 0.9 * sample_name_shrink | 3595 , varwidth = FALSE |
| 1483 , col = const_boxplot_fill | 3596 , sub = NULL |
| 1484 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3597 , xlab |
| 1485 , xlab = "Sample" | 3598 , ylab |
| 1486 ) | 3599 , col = const_boxplot_fill |
| 1487 par(old_par) | 3600 , notch = FALSE |
| 1488 | 3601 , ppep_distrib_ctl = g_ppep_distrib_ctl |
| 1489 | 3602 , ... |
| 1490 | 3603 ) { |
| 1491 cat("\n\n\n") | 3604 my_xlab_line <- g_ppep_distrib_ctl$xlab_line |
| 1492 cat("\n\n\n") | 3605 my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom |
| 3606 my_axis <- g_ppep_distrib_ctl$axis | |
| 3607 | |
| 3608 if (print_trace_messages) { | |
| 3609 cat_variable(my_xlab_line, suffix = "; ") | |
| 3610 cat_variable(my_mai_bottom, suffix = "; ") | |
| 3611 cat_variable(my_axis, suffix = "\n\n") | |
| 3612 } | |
| 3613 | |
| 3614 old_par <- par( | |
| 3615 mai = par("mai") + c(my_mai_bottom, 0, 0, 0), | |
| 3616 cex.axis = my_axis, | |
| 3617 cex.lab = 1.2 | |
| 3618 ) | |
| 3619 tryCatch( | |
| 3620 { | |
| 3621 # Vertical plot | |
| 3622 boxplot( | |
| 3623 x | |
| 3624 , las = 2 | |
| 3625 , col = col | |
| 3626 , main = main | |
| 3627 , sub = NULL | |
| 3628 , ylab = ylab | |
| 3629 , xlab = NULL | |
| 3630 , notch = notch | |
| 3631 , varwidth = varwidth | |
| 3632 , ... | |
| 3633 ) | |
| 3634 title( | |
| 3635 sub = sub | |
| 3636 , cex.sub = 1.0 | |
| 3637 , line = my_xlab_line + 1 | |
| 3638 ) | |
| 3639 title( | |
| 3640 xlab = xlab | |
| 3641 , line = my_xlab_line | |
| 3642 ) | |
| 3643 }, | |
| 3644 finally = par(old_par) | |
| 3645 ) | |
| 3646 } | |
| 3647 | |
| 3648 my_ppep_distrib_bxp( | |
| 3649 x = quant_data_log | |
| 3650 , sample_name_grow = sample_name_grow | |
| 3651 , main = "Peptide intensities for each sample" | |
| 3652 , varwidth = boxplot_varwidth | |
| 3653 , sub = "Box widths reflect number of peptides for sample" | |
| 3654 , xlab = "Sample" | |
| 3655 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 3656 , col = const_boxplot_fill | |
| 3657 , notch = FALSE | |
| 3658 ) | |
| 3659 | |
| 3660 cat("\n\n\n\n") | |
| 1493 | 3661 |
| 1494 ``` | 3662 ``` |
| 1495 | 3663 |
| 1496 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | 3664 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
| 1497 if (nrow(quant_data_log) > 1) { | 3665 if (nrow(quant_data_log) > 1) { |
| 1536 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | 3704 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), |
| 1537 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 3705 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
| 1538 ) | 3706 ) |
| 1539 ``` | 3707 ``` |
| 1540 | 3708 |
| 3709 # Characterization of Input Data | |
| 3710 | |
| 1541 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values | 3711 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values |
| 1542 | 3712 |
| 1543 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | 3713 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
| 1544 # determine quantile | 3714 # determine quantile |
| 1545 q1 <- quantile(logvalues, probs = mean_percentile)[1] | 3715 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
| 1546 | 3716 |
| 1547 # 1 = row of matrix (ie, phosphopeptide) | 3717 # 1 = row of matrix (ie, phosphopeptide) |
| 1548 sds <- apply(quant_data_log, 1, sd_finite) | 3718 sds <- row_apply(quant_data_log, sd_finite) |
| 1549 if (sum(!is.na(sds)) > 2) { | 3719 if (sum(!is.na(sds)) > 2) { |
| 1550 plot( | 3720 plot( |
| 1551 density(sds, na.rm = TRUE) | 3721 density(sds, na.rm = TRUE) |
| 1552 , main = "Smoothed estimated probability density vs. std. deviation" | 3722 , main = "Smoothed estimated probability density vs. std. deviation" |
| 1553 , sub = "(probability estimation made with Gaussian smoothing)" | 3723 , sub = "(probability estimation made with Gaussian smoothing)" |
| 1577 ```{r echo = FALSE} | 3747 ```{r echo = FALSE} |
| 1578 | 3748 |
| 1579 # prep for trt-median based imputation | 3749 # prep for trt-median based imputation |
| 1580 | 3750 |
| 1581 ``` | 3751 ``` |
| 1582 # Impute Missing Values | 3752 # Imputation of Missing Values |
| 1583 | 3753 |
| 1584 ```{r echo = FALSE} | 3754 ```{r echo = FALSE} |
| 1585 | 3755 |
| 1586 imp_smry_pot_peptides_before <- nrow(quant_data_log) | 3756 imp_smry_pot_peptides_before <- nrow(quant_data_log) |
| 1587 imp_smry_missing_values_before <- number_to_impute | 3757 imp_smry_missing_values_before <- number_to_impute |
| 1608 quant_data_imp <- quant_data | 3778 quant_data_imp <- quant_data |
| 1609 imputation_method_description <- | 3779 imputation_method_description <- |
| 1610 paste("Substitute missing value with", | 3780 paste("Substitute missing value with", |
| 1611 "median peptide-intensity for sample group.\n" | 3781 "median peptide-intensity for sample group.\n" |
| 1612 ) | 3782 ) |
| 1613 sample_level_integers <- as.integer(sample_treatment_levels) | |
| 1614 # Take the accurate ln(x+1) because the data are log-normally distributed | 3783 # Take the accurate ln(x+1) because the data are log-normally distributed |
| 1615 # and because median can involve an average of two measurements. | 3784 # and because median can involve an average of two measurements. |
| 1616 quant_data_imp <- log1p(quant_data_imp) | 3785 quant_data_imp <- log1p(quant_data_imp) |
| 1617 for (i in seq_len(length(levels(sample_treatment_levels)))) { | 3786 for (i in seq_len(count_of_treatment_levels)) { |
| 1618 # Determine the columns for this factor-level | 3787 # Determine the columns for this factor-level |
| 1619 level_cols <- i == sample_level_integers | 3788 level_cols <- i == sample_level_integers |
| 1620 # Extract those columns | 3789 # Extract those columns |
| 1621 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] | 3790 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] |
| 1622 # assign to ind the row-column pairs corresponding to each NA | 3791 # assign to ind the row-column pairs corresponding to each NA |
| 1623 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | 3792 ind <- which(is.na(lvlsbst), arr.ind = TRUE) |
| 1624 # No group-median exists if there is only one sample | 3793 # No group-median exists if there is only one sample |
| 1625 # a given ppep has no measurement; otherwise, proceed. | 3794 # a given ppep has no measurement; otherwise, proceed. |
| 1626 if (ncol(lvlsbst) > 1) { | 3795 if (ncol(lvlsbst) > 1) { |
| 1627 the_centers <- | 3796 the_centers <- |
| 1628 apply(lvlsbst, 1, median, na.rm = TRUE) | 3797 row_apply(lvlsbst, median, na.rm = TRUE) |
| 1629 for (j in seq_len(nrow(lvlsbst))) { | 3798 for (j in seq_len(nrow(lvlsbst))) { |
| 1630 for (k in seq_len(ncol(lvlsbst))) { | 3799 for (k in seq_len(ncol(lvlsbst))) { |
| 1631 if (is.na(lvlsbst[j, k])) { | 3800 if (is.na(lvlsbst[j, k])) { |
| 1632 lvlsbst[j, k] <- the_centers[j] | 3801 lvlsbst[j, k] <- the_centers[j] |
| 1633 } | 3802 } |
| 1647 "median peptide-intensity across all sample classes.\n" | 3816 "median peptide-intensity across all sample classes.\n" |
| 1648 ) | 3817 ) |
| 1649 # Take the accurate ln(x+1) because the data are log-normally distributed | 3818 # Take the accurate ln(x+1) because the data are log-normally distributed |
| 1650 # and because median can involve an average of two measurements. | 3819 # and because median can involve an average of two measurements. |
| 1651 quant_data_imp <- log1p(quant_data_imp) | 3820 quant_data_imp <- log1p(quant_data_imp) |
| 1652 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | 3821 quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] |
| 1653 # Take the accurate e^x - 1 to match scaling of original input. | 3822 # Take the accurate e^x - 1 to match scaling of original input. |
| 1654 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3823 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
| 1655 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3824 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
| 1656 } | 3825 } |
| 1657 , "mean" = { | 3826 , "mean" = { |
| 1663 # Take the accurate ln(x+1) because the data are log-normally distributed, | 3832 # Take the accurate ln(x+1) because the data are log-normally distributed, |
| 1664 # so arguments to mean should be previously transformed. | 3833 # so arguments to mean should be previously transformed. |
| 1665 # this will have to be | 3834 # this will have to be |
| 1666 quant_data_imp <- log1p(quant_data_imp) | 3835 quant_data_imp <- log1p(quant_data_imp) |
| 1667 # Assign to NA cells the mean for the row | 3836 # Assign to NA cells the mean for the row |
| 1668 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | 3837 quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] |
| 1669 # Take the accurate e^x - 1 to match scaling of original input. | 3838 # Take the accurate e^x - 1 to match scaling of original input. |
| 1670 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | 3839 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) |
| 1671 good_rows <- !is.nan(rowMeans(quant_data_imp)) | 3840 good_rows <- !is.nan(rowMeans(quant_data_imp)) |
| 1672 } | 3841 } |
| 1673 , "random" = { | 3842 , "random" = { |
| 1707 ```{r echo = FALSE} | 3876 ```{r echo = FALSE} |
| 1708 | 3877 |
| 1709 imp_smry_pot_peptides_after <- sum(good_rows) | 3878 imp_smry_pot_peptides_after <- sum(good_rows) |
| 1710 imp_smry_rejected_after <- sum(!good_rows) | 3879 imp_smry_rejected_after <- sum(!good_rows) |
| 1711 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) | 3880 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) |
| 3881 | |
| 3882 # From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0 | |
| 3883 | |
| 3884 sink(stderr()) | |
| 3885 print("`%in%`:") | |
| 3886 print(`%in%`) | |
| 3887 sink() | |
| 3888 | |
| 3889 stock_in <- | |
| 3890 names(good_rows) %in% | |
| 3891 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
| 3892 if (print_nb_messages) nbe(see_variable(stock_in), "\n") | |
| 3893 | |
| 3894 explicit_in <- | |
| 3895 0 < match( | |
| 3896 names(good_rows), | |
| 3897 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
| 3898 ) | |
| 3899 if (print_nb_messages) nbe(see_variable(explicit_in), "\n") | |
| 3900 | |
| 3901 great_enough_row_names <- good_rows[ | |
| 3902 names(good_rows) %in% | |
| 3903 names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) | |
| 3904 ] | |
| 3905 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
| 3906 great_enough_row_names <- great_enough_row_names[great_enough_row_names] | |
| 3907 if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") | |
| 1712 ``` | 3908 ``` |
| 3909 | |
| 1713 ```{r echo = FALSE, results = 'asis'} | 3910 ```{r echo = FALSE, results = 'asis'} |
| 1714 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | 3911 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf |
| 1715 tabular_lines_fmt <- paste( | 3912 tabular_lines_fmt <- paste( |
| 1716 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | 3913 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) |
| 3914 " \\leavevmode", | |
| 1717 " \\caption{Imputation Results}", | 3915 " \\caption{Imputation Results}", |
| 1718 " \\centering", # \centering centers the table on the page | 3916 " \\centering", # \centering centers the table on the page |
| 1719 " \\begin{tabular}{l c c c}", | 3917 " \\begin{tabular}{l c c c}", |
| 1720 " \\hline\\hline", | 3918 " \\hline\\hline", |
| 1721 " \\ & potential peptides & missing values & rejected", | 3919 " \\ & potential peptides & missing values & rejected", |
| 1722 " peptides \\\\ [0.5ex]", | 3920 " peptides \\\\ [0.5ex]", |
| 1723 " \\hline", | 3921 " \\hline", |
| 1724 " before imputation & %d & %d (%d\\%s) & \\\\", | 3922 " before imputation & %d & %d (%d\\%s) & \\\\", |
| 1725 " after imputation & %d & %d & %d \\\\ [1ex]", | 3923 " after imputation & %d & %d & %d \\\\", |
| 3924 " after keep comparable & %d & & %d \\\\ [1ex]", | |
| 1726 " \\hline", | 3925 " \\hline", |
| 1727 " \\end{tabular}", | 3926 " \\end{tabular}", |
| 1728 #" \\label{table:nonlin}", # may be used to refer this table in the text | 3927 #" \\label{table:nonlin}", # may be used to refer this table in the text |
| 1729 "\\end{table}", | 3928 "\\end{table}", |
| 1730 sep = "\n" | 3929 sep = "\n" |
| 1736 imp_smry_missing_values_before, | 3935 imp_smry_missing_values_before, |
| 1737 imp_smry_pct_missing, | 3936 imp_smry_pct_missing, |
| 1738 "%", | 3937 "%", |
| 1739 imp_smry_pot_peptides_after, | 3938 imp_smry_pot_peptides_after, |
| 1740 imp_smry_missing_values_after, | 3939 imp_smry_missing_values_after, |
| 1741 imp_smry_rejected_after | 3940 imp_smry_rejected_after, |
| 3941 length(great_enough_row_names), | |
| 3942 imp_smry_pot_peptides_before - | |
| 3943 length(great_enough_row_names) | |
| 1742 ) | 3944 ) |
| 1743 cat(tabular_lines) | 3945 cat(tabular_lines) |
| 1744 ``` | 3946 ``` |
| 1745 ```{r echo = FALSE} | 3947 |
| 1746 | 3948 ```{r filter_good_rows, echo = FALSE} |
| 1747 | 3949 |
| 1748 # Zap rows where imputation was ineffective | 3950 if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") |
| 3951 good_rows <- names(good_rows[names(great_enough_row_names)]) | |
| 3952 if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") | |
| 3953 | |
| 3954 #ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)] | |
| 3955 #ACE nbe("good_rows") | |
| 3956 #ACE nbe(see_variable(good_rows)) | |
| 3957 #ACE nbe("names(min_group_obs_count) before filter for good rows") | |
| 3958 #ACE nbe(see_variable(names(min_group_obs_count))) | |
| 3959 min_group_obs_count <- min_group_obs_count[good_rows] | |
| 3960 #ACE nbe("min_group_obs_count after filter for good rows") | |
| 3961 #ACE nbe(see_variable(names(min_group_obs_count))) | |
| 3962 | |
| 3963 # Zap rows where imputation was insufficiently effective | |
| 1749 full_data <- full_data [good_rows, ] | 3964 full_data <- full_data [good_rows, ] |
| 1750 quant_data <- quant_data [good_rows, ] | 3965 quant_data <- quant_data [good_rows, ] |
| 1751 | 3966 quant_data_log <- quant_data_log [good_rows, ] |
| 3967 | |
| 3968 if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
| 1752 quant_data_imp <- quant_data_imp[good_rows, ] | 3969 quant_data_imp <- quant_data_imp[good_rows, ] |
| 3970 if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n") | |
| 1753 write_debug_file(quant_data_imp) | 3971 write_debug_file(quant_data_imp) |
| 1754 quant_data_imp_good_rows <- quant_data_imp | 3972 quant_data_imp_good_rows <- quant_data_imp |
| 1755 | 3973 |
| 1756 write_debug_file(quant_data_imp_good_rows) | 3974 write_debug_file(quant_data_imp_good_rows) |
| 1757 ``` | 3975 ``` |
| 1801 d_imputed <- d_combined | 4019 d_imputed <- d_combined |
| 1802 } | 4020 } |
| 1803 | 4021 |
| 1804 ``` | 4022 ``` |
| 1805 | 4023 |
| 1806 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 4024 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
| 1807 zero_sd_rownames <- | 4025 zero_sd_rownames <- |
| 1808 rownames(quant_data_imp)[ | 4026 rownames(quant_data_imp)[ |
| 1809 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | 4027 is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) |
| 1810 ] | 4028 ] |
| 1811 | 4029 |
| 1812 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | 4030 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { |
| 1813 stop("All peptides have zero standard deviation. Cannot continue.") | 4031 cat("All peptides have zero standard deviation. Cannot continue.") |
| 4032 param_df_exit() | |
| 4033 knitr::knit_exit() | |
| 1814 } | 4034 } |
| 1815 if (length(zero_sd_rownames) > 0) { | 4035 if (length(zero_sd_rownames) > 0) { |
| 1816 cat( | 4036 cat( |
| 1817 sprintf("%d peptides with zero variance were removed from statistical consideration", | 4037 sprintf( |
| 1818 length(zero_sd_rownames) | 4038 "%d %s %s", |
| 4039 length(zero_sd_rownames), | |
| 4040 "peptides with zero variance", | |
| 4041 "were removed from statistical consideration" | |
| 1819 ) | 4042 ) |
| 1820 ) | 4043 ) |
| 1821 zap_named_rows <- function(df, nms) { | 4044 zap_named_rows <- function(df, nms) { |
| 1822 return(df[!(row.names(df) %in% nms), ]) | 4045 return(df[!(row.names(df) %in% nms), ]) |
| 1823 } | 4046 } |
| 1824 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | 4047 quant_data_imp <- |
| 1825 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | 4048 zap_named_rows(quant_data_imp, zero_sd_rownames) |
| 1826 full_data <- zap_named_rows(full_data, zero_sd_rownames) | 4049 quant_data <- |
| 4050 zap_named_rows(quant_data, zero_sd_rownames) | |
| 4051 full_data <- | |
| 4052 zap_named_rows(full_data, zero_sd_rownames) | |
| 4053 min_group_obs_count <- | |
| 4054 min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] | |
| 1827 } | 4055 } |
| 1828 | 4056 |
| 1829 if (sum(is.na(quant_data)) > 0) { | 4057 if (sum(is.na(quant_data)) > 0) { |
| 1830 cat("\\leavevmode\\newpage\n") | 4058 cat("\\leavevmode\\newpage\n") |
| 1831 # data visualization | |
| 1832 old_par <- par( | |
| 1833 mai = par("mai") + c(0.5, 0, 0, 0) | |
| 1834 ) | |
| 1835 # Copy quant data to x | 4059 # Copy quant data to x |
| 1836 x <- quant_data | 4060 x <- quant_data |
| 1837 # x gets to have values of: | 4061 # x gets to have values of: |
| 1838 # - NA for observed values | 4062 # - NA for observed values |
| 1839 # - 1 for missing values | 4063 # - 1 for missing values |
| 1858 max(red_dots, blue_dots, na.rm = TRUE) | 4082 max(red_dots, blue_dots, na.rm = TRUE) |
| 1859 ) | 4083 ) |
| 1860 show_stripchart <- | 4084 show_stripchart <- |
| 1861 50 > (count_red + count_blue) / length(sample_name_matches) | 4085 50 > (count_red + count_blue) / length(sample_name_matches) |
| 1862 if (show_stripchart) { | 4086 if (show_stripchart) { |
| 1863 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" | 4087 boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" |
| 1864 } else { | 4088 } else { |
| 1865 boxplot_sub <- "" | 4089 boxplot_sub <- "" |
| 1866 } | 4090 } |
| 1867 | 4091 |
| 1868 # Vertical plot | 4092 # Vertical plot |
| 1869 colnames(blue_dots) <- sample_name_matches | 4093 colnames(blue_dots) <- sample_name_matches |
| 1870 boxplot( | 4094 my_ppep_distrib_bxp( |
| 1871 blue_dots | 4095 x = blue_dots |
| 1872 , las = 2 # "always vertical" | 4096 , sample_name_grow = sample_name_grow |
| 1873 , cex.axis = 0.9 * sample_name_shrink | |
| 1874 , col = const_boxplot_fill | |
| 1875 , ylim = ylim | |
| 1876 , main = "Peptide intensities after eliminating unusable peptides" | 4097 , main = "Peptide intensities after eliminating unusable peptides" |
| 1877 , sub = boxplot_sub | 4098 , varwidth = boxplot_varwidth |
| 4099 , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") | |
| 1878 , xlab = "Sample" | 4100 , xlab = "Sample" |
| 1879 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4101 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
| 4102 , col = const_boxplot_fill | |
| 4103 , notch = FALSE | |
| 4104 , ylim = ylim | |
| 1880 ) | 4105 ) |
| 1881 | 4106 |
| 1882 if (show_stripchart) { | 4107 if (show_stripchart) { |
| 1883 # Points | 4108 # Points |
| 1884 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 4109 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
| 1886 stripchart( | 4111 stripchart( |
| 1887 blue_dots, # Data | 4112 blue_dots, # Data |
| 1888 method = "jitter", # Random noise | 4113 method = "jitter", # Random noise |
| 1889 jitter = const_stripchart_jitter, | 4114 jitter = const_stripchart_jitter, |
| 1890 pch = 19, # Pch symbols | 4115 pch = 19, # Pch symbols |
| 1891 cex = const_stripsmall_cex, # Size of symbols reduced | 4116 cex = const_stripchart_cex, # Size of symbols reduced |
| 1892 col = "lightblue", # Color of the symbol | 4117 col = "lightblue", # Color of the symbol |
| 1893 vertical = TRUE, # Vertical mode | 4118 vertical = TRUE, # Vertical mode |
| 1894 add = TRUE # Add it over | 4119 add = TRUE # Add it over |
| 1895 ) | 4120 ) |
| 1896 stripchart( | 4121 stripchart( |
| 1897 red_dots, # Data | 4122 red_dots, # Data |
| 1898 method = "jitter", # Random noise | 4123 method = "jitter", # Random noise |
| 1899 jitter = const_stripchart_jitter, | 4124 jitter = const_stripchart_jitter, |
| 1900 pch = 19, # Pch symbols | 4125 pch = 19, # Pch symbols |
| 1901 cex = const_stripsmall_cex, # Size of symbols reduced | 4126 cex = const_stripchart_cex, # Size of symbols reduced |
| 1902 col = "red", # Color of the symbol | 4127 col = "red", # Color of the symbol |
| 1903 vertical = TRUE, # Vertical mode | 4128 vertical = TRUE, # Vertical mode |
| 1904 add = TRUE # Add it over | 4129 add = TRUE # Add it over |
| 1905 ) | 4130 ) |
| 1906 | 4131 |
| 1907 } | 4132 } |
| 1908 if (TRUE) { | 4133 } |
| 1909 # show measured values in blue on left half-violin plot | 4134 ``` |
| 1910 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | 4135 |
| 1911 vioplot::vioplot( | 4136 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
| 1912 x = lapply(blue_dots, function(x) x[!is.na(x)]), | 4137 if (sum(is.na(quant_data)) > 0) { |
| 1913 col = "lightblue1", | 4138 # show measured values in blue on left half-violin plot |
| 1914 side = "left", | 4139 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") |
| 1915 plotCentre = "line", | 4140 old_par <- par( |
| 1916 ylim = ylim_save, | 4141 mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), |
| 1917 main = "Distributions of observed and imputed data", | 4142 cex.axis = g_ppep_distrib_ctl$axis, |
| 1918 sub = "Light blue = observed data; Pink = imputed data", | 4143 cex.lab = 1.2 |
| 1919 las = 2, | 4144 ) |
| 1920 cex.axis = 0.9 * sample_name_shrink, | 4145 tryCatch( |
| 1921 xlab = "Sample", | 4146 { |
| 1922 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4147 vioplot::vioplot( |
| 1923 ) | 4148 x = lapply(blue_dots, function(x) x[!is.na(x)]), |
| 1924 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | 4149 col = "lightblue1", |
| 1925 cols_to_delete <- c() | 4150 side = "left", |
| 1926 for (ix in seq_len(length(red_violins))) { | 4151 plotCentre = "line", |
| 1927 if (length(red_violins[[ix]]) < 1) { | 4152 ylim = ylim_save, |
| 1928 cols_to_delete <- c(cols_to_delete, ix) | 4153 main = "Distributions of observed and imputed data", |
| 4154 sub = NULL, | |
| 4155 las = 2, | |
| 4156 xlab = NULL, | |
| 4157 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 4158 ) | |
| 4159 title( | |
| 4160 sub = "Light blue = observed data; Pink = imputed data", | |
| 4161 cex.sub = 1.0, | |
| 4162 line = g_ppep_distrib_ctl$xlab_line + 1 | |
| 4163 ) | |
| 4164 title( | |
| 4165 xlab = "Sample", | |
| 4166 line = g_ppep_distrib_ctl$xlab_line | |
| 4167 ) | |
| 4168 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
| 4169 cols_to_delete <- c() | |
| 4170 for (ix in seq_len(length(red_violins))) { | |
| 4171 if (length(red_violins[[ix]]) < 1) { | |
| 4172 cols_to_delete <- c(cols_to_delete, ix) | |
| 4173 } | |
| 1929 } | 4174 } |
| 1930 } | 4175 # destroy any unimputable columns |
| 1931 # destroy any unimputable columns | 4176 if (!is.null(cols_to_delete)) { |
| 1932 if (!is.null(cols_to_delete)) { | 4177 red_violins <- red_violins[-cols_to_delete] |
| 1933 red_violins <- red_violins[-cols_to_delete] | 4178 } |
| 1934 } | 4179 # plot imputed values in red on right half-violin plot |
| 1935 # plot imputed values in red on right half-violin plot | 4180 vioplot::vioplot( |
| 1936 vioplot::vioplot( | 4181 x = red_violins, |
| 1937 x = red_violins, | 4182 col = "lightpink1", |
| 1938 col = "lightpink1", | 4183 side = "right", |
| 1939 side = "right", | 4184 plotCentre = "line", |
| 1940 plotCentre = "line", | 4185 add = TRUE |
| 1941 add = TRUE | 4186 ) |
| 1942 ) | 4187 |
| 1943 } | 4188 }, |
| 1944 | 4189 finally = par(old_par) |
| 1945 par(old_par) | 4190 ) |
| 1946 | 4191 |
| 1947 # density plot | 4192 # density plot |
| 1948 cat("\\leavevmode\n\n\n\n\n\n\n") | 4193 cat("\\leavevmode\n\n\n\n\n\n\n") |
| 1949 if (can_plot_before_after_imp) { | 4194 if (can_plot_before_after_imp) { |
| 1950 ylim <- c( | 4195 ylim <- c( |
| 1978 } | 4223 } |
| 1979 cat("\\leavevmode\\newpage\n") | 4224 cat("\\leavevmode\\newpage\n") |
| 1980 } | 4225 } |
| 1981 ``` | 4226 ``` |
| 1982 | 4227 |
| 1983 # Perform Quantile Normalization | 4228 # Quantile Normalization |
| 1984 | 4229 |
| 1985 The excellent `normalize.quantiles` function from | 4230 The excellent `normalize.quantiles` function from |
| 1986 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | 4231 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* |
| 1987 performs "quantile normalization" as described Bolstad *et al.* (2003), | 4232 performs "quantile normalization" as described Bolstad *et al.* (2003), |
| 1988 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | 4233 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* |
| 1989 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | 4234 and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), |
| 1990 i.e., it assumes that the goal is to detect | 4235 i.e., it assumes that the goal is to detect |
| 1991 subtle differences among grossly similar samples (having similar distributions) | 4236 subtle differences among grossly similar samples (having similar distributions) |
| 1992 by equailzing intra-quantile quantitations. | 4237 by equalizing intra-quantile quantitations^[Unfortunately, |
| 1993 Unfortunately, one software library upon which it depends | 4238 one software library upon which `preprocessCore` depends |
| 1994 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | 4239 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* |
| 1995 that requires that a specific, non-concurrent version of the library be | 4240 that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be |
| 1996 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | 4241 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: |
| 1997 ``` | 4242 \linebreak |
| 1998 conda install bioconductor-preprocesscore openblas=0.3.3 | 4243 ` conda install bioconductor-preprocesscore openblas=0.3.3`]. |
| 1999 ``` | |
| 2000 | 4244 |
| 2001 | 4245 |
| 2002 <!-- | 4246 <!-- |
| 2003 # Apply quantile normalization using preprocessCore::normalize.quantiles | 4247 # Apply quantile normalization using preprocessCore::normalize.quantiles |
| 2004 # --- | 4248 # --- |
| 2005 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | 4249 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html |
| 2006 # except this: https://support.bioconductor.org/p/122925/#9135989 | 4250 # except this: https://support.bioconductor.org/p/122925/#9135989 |
| 2007 # says to install it like this: | 4251 # says to install it like this: |
| 2008 # ``` | 4252 # ``` |
| 2009 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) | 4253 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) |
| 2010 # ``` | 4254 # ``` |
| 2011 # conda installation (necessary because of a bug in recent openblas): | 4255 # conda installation (necessary because of a bug in recent openblas): |
| 2012 # conda install bioconductor-preprocesscore openblas=0.3.3 | 4256 # conda install bioconductor-preprocesscore openblas=0.3.3 |
| 2013 # ... | 4257 # ... |
| 2014 # --- | 4258 # --- |
| 2015 # normalize.quantiles {preprocessCore} -- Quantile Normalization | 4259 # normalize.quantiles {preprocessCore} -- Quantile Normalization |
| 2016 # | 4260 # |
| 2017 # Description: | 4261 # Description: |
| 2018 # Using a normalization based upon quantiles, this function normalizes a | 4262 # Using a normalization based upon quantiles, this function normalizes a |
| 2019 # matrix of probe level intensities. | 4263 # matrix of probe level intensities. |
| 2020 # | 4264 # |
| 2021 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | 4265 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the |
| 2022 # assumption that the data is missing at random. | 4266 # assumption that the data is missing at random. |
| 2023 # | 4267 # |
| 2024 # Usage: | 4268 # Usage: |
| 2025 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | 4269 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) |
| 2026 # | 4270 # |
| 2027 # Arguments: | 4271 # Arguments: |
| 2028 # | 4272 # |
| 2029 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | 4273 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. |
| 2030 # | 4274 # |
| 2031 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | 4275 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, |
| 2032 # but in certain situations not making a copy of the matrix, but instead normalizing | 4276 # but in certain situations not making a copy of the matrix, but instead normalizing |
| 2033 # it in place will be more memory friendly. | 4277 # it in place will be more memory friendly. |
| 2034 # | 4278 # |
| 2035 # - keep.names: Boolean option to preserve matrix row and column names in output. | 4279 # - keep.names: Boolean option to preserve matrix row and column names in output. |
| 2036 # | 4280 # |
| 2037 # Details: | 4281 # Details: |
| 2038 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | 4282 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. |
| 2039 # No special allowances are made for outliers. If you make use of quantile normalization | 4283 # No special allowances are made for outliers. If you make use of quantile normalization |
| 2040 # please cite Bolstad et al, Bioinformatics (2003). | 4284 # please cite Bolstad et al, Bioinformatics (2003). |
| 2041 # | 4285 # |
| 2042 # This functions will handle missing data (ie NA values), based on | 4286 # This functions will handle missing data (ie NA values), based on |
| 2043 # the assumption that the data is missing at random. | 4287 # the assumption that the data is missing at random. |
| 2044 # | 4288 # |
| 2045 # Note that the current implementation optimizes for better memory usage | 4289 # Note that the current implementation optimizes for better memory usage |
| 2046 # at the cost of some additional run-time. | 4290 # at the cost of some additional run-time. |
| 2047 # | 4291 # |
| 2048 # Value: A normalized matrix. | 4292 # Value: A normalized matrix. |
| 2049 # | 4293 # |
| 2050 # Author: Ben Bolstad, bmbolstad.com | 4294 # Author: Ben Bolstad, bmbolstad.com |
| 2051 # | 4295 # |
| 2052 # References | 4296 # References |
| 2053 # | 4297 # |
| 2054 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | 4298 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide |
| 2055 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | 4299 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf |
| 2056 # | 4300 # |
| 2057 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | 4301 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of |
| 2058 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | 4302 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias |
| 2059 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 4303 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
| 2060 # http://bmbolstad.com/misc/normalize/normalize.html | 4304 # http://bmbolstad.com/misc/normalize/normalize.html |
| 2061 # ... | 4305 # ... |
| 2062 --> | 4306 --> |
| 2063 ```{r echo = FALSE, results = 'asis'} | 4307 ```{r echo = FALSE, results = 'asis'} |
| 2064 | 4308 |
| 4309 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") | |
| 2065 if (nrow(quant_data_imp) > 0) { | 4310 if (nrow(quant_data_imp) > 0) { |
| 2066 quant_data_imp_qn <- preprocessCore::normalize.quantiles( | 4311 quant_data_imp_qn <- preprocessCore::normalize.quantiles( |
| 2067 as.matrix(quant_data_imp), keep.names = TRUE | 4312 as.matrix(quant_data_imp), keep.names = TRUE |
| 2068 ) | 4313 ) |
| 2069 } else { | 4314 } else { |
| 2070 quant_data_imp_qn <- as.matrix(quant_data_imp) | 4315 quant_data_imp_qn <- as.matrix(quant_data_imp) |
| 2071 } | 4316 } |
| 2072 | 4317 |
| 4318 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") | |
| 4319 | |
| 2073 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | 4320 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
| 2074 | |
| 2075 write_debug_file(quant_data_imp_qn) | 4321 write_debug_file(quant_data_imp_qn) |
| 2076 | 4322 |
| 2077 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | 4323 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
| 2078 | |
| 2079 write_debug_file(quant_data_imp_qn_log) | 4324 write_debug_file(quant_data_imp_qn_log) |
| 2080 | 4325 |
| 4326 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") | |
| 4327 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") | |
| 4328 | |
| 2081 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | 4329 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
| 2082 | 4330 |
| 2083 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | 4331 sel <- row_apply(quant_data_imp_qn_ls, any_nan) |
| 2084 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls | 4332 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls |
| 2085 | 4333 |
| 2086 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | 4334 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] |
| 2087 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | 4335 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
| 2088 | 4336 |
| 2093 | 4341 |
| 2094 # Create data.frame used by ANOVA analysis | 4342 # Create data.frame used by ANOVA analysis |
| 2095 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | 4343 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
| 2096 ``` | 4344 ``` |
| 2097 | 4345 |
| 2098 <!-- ACE insertion begin --> | |
| 2099 ## Are normalized, imputed, log-transformed sample distributions similar? | 4346 ## Are normalized, imputed, log-transformed sample distributions similar? |
| 2100 | 4347 |
| 2101 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 4348 ```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} |
| 2102 | 4349 |
| 2103 # Save unimputed quant_data_log for plotting below | 4350 # Save unimputed quant_data_log for plotting below |
| 2104 unimputed_quant_data_log <- quant_data_log | 4351 unimputed_quant_data_log <- quant_data_log |
| 2105 | 4352 |
| 2106 # log10 transform (after preparing for zero values, | 4353 # log10 transform (after preparing for zero values, |
| 2119 ) | 4366 ) |
| 2120 cat("\n\n\n") | 4367 cat("\n\n\n") |
| 2121 | 4368 |
| 2122 | 4369 |
| 2123 # data visualization | 4370 # data visualization |
| 4371 if (TRUE) { | |
| 4372 | |
| 4373 my_ppep_distrib_bxp( | |
| 4374 x = quant_data_log | |
| 4375 , sample_name_grow = sample_name_grow | |
| 4376 , main = "Peptide intensities for each sample" | |
| 4377 , varwidth = boxplot_varwidth | |
| 4378 , sub = NULL | |
| 4379 , xlab = "Sample" | |
| 4380 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 4381 , col = const_boxplot_fill | |
| 4382 , notch = boxplot_notch | |
| 4383 ) | |
| 4384 | |
| 4385 } else { | |
| 4386 | |
| 2124 old_par <- par( | 4387 old_par <- par( |
| 2125 mai = par("mai") + c(0.5, 0, 0, 0) | 4388 mai = par("mai") + c(0.5, 0, 0, 0) |
| 2126 , oma = par("oma") + c(0.5, 0, 0, 0) | 4389 , oma = par("oma") + c(0.5, 0, 0, 0) |
| 2127 ) | 4390 ) |
| 2128 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 4391 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
| 2129 # Vertical plot | 4392 # Vertical plot |
| 2130 colnames(quant_data_log) <- sample_name_matches | 4393 colnames(quant_data_log) <- sample_name_matches |
| 2131 boxplot( | 4394 boxplot( |
| 2132 quant_data_log | 4395 quant_data_log |
| 2133 , las = 2 | 4396 , las = 2 |
| 2134 , cex.axis = 0.9 * sample_name_shrink | 4397 , cex.axis = 0.9 * sample_name_grow |
| 2135 , col = const_boxplot_fill | 4398 , col = const_boxplot_fill |
| 2136 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4399 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
| 2137 , xlab = "Sample" | 4400 , xlab = "Sample" |
| 4401 , notch = boxplot_notch | |
| 4402 , varwidth = boxplot_varwidth | |
| 2138 ) | 4403 ) |
| 2139 par(old_par) | 4404 par(old_par) |
| 4405 } | |
| 2140 } else { | 4406 } else { |
| 2141 cat("There are no peptides to plot\n") | 4407 cat("There are no peptides to plot\n") |
| 2142 } | 4408 } |
| 2143 | 4409 |
| 2144 cat("\n\n\n") | 4410 cat("\n\n\n") |
| 2160 cat("\\leavevmode\\newpage\n") | 4426 cat("\\leavevmode\\newpage\n") |
| 2161 ``` | 4427 ``` |
| 2162 | 4428 |
| 2163 # ANOVA Analysis | 4429 # ANOVA Analysis |
| 2164 | 4430 |
| 2165 ```{r, echo = FALSE} | 4431 ## Assignment of $p$-value and quality score |
| 4432 | |
| 4433 For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. | |
| 4434 | |
| 4435 However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x). | |
| 4436 | |
| 4437 Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: | |
| 4438 | |
| 4439 $$ | |
| 4440 \textit{quality}_j | |
| 4441 = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} | |
| 4442 $$ | |
| 4443 | |
| 4444 where: | |
| 4445 | |
| 4446 - $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and | |
| 4447 - $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. | |
| 4448 | |
| 4449 | |
| 4450 ```{r, echo = FALSE, results = 'asis'} | |
| 4451 cat("\\newpage\n") | |
| 4452 | |
| 2166 # Make new data frame containing only Phosphopeptides | 4453 # Make new data frame containing only Phosphopeptides |
| 2167 # to connect preANOVA to ANOVA (connect_df) | 4454 # to connect preANOVA to ANOVA (connect_df) |
| 2168 connect_df <- data.frame( | 4455 connect_df <- data.frame( |
| 2169 data_table_imp_qn_lt$Phosphopeptide | 4456 data_table_imp_qn_lt$Phosphopeptide |
| 2170 , data_table_imp_qn_lt[, first_data_column] | 4457 , data_table_imp_qn_lt[, first_data_column] |
| 2171 ) | 4458 ) |
| 2172 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | 4459 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
| 2173 ``` | 4460 ``` |
| 2174 | 4461 |
| 2175 ```{r anova, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4462 ```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} |
| 2176 count_of_treatment_levels <- length(levels(sample_treatment_levels)) | 4463 g_can_run_ksea <- FALSE |
| 4464 old_oma <- par("oma") | |
| 2177 if (count_of_treatment_levels < 2) { | 4465 if (count_of_treatment_levels < 2) { |
| 2178 nuke_control_sequences <- | |
| 2179 function(s) { | |
| 2180 s <- gsub("[\\]", "xyzzy_plugh", s) | |
| 2181 s <- gsub("[$]", "\\\\$", s) | |
| 2182 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
| 2183 return(s) | |
| 2184 } | |
| 2185 cat( | 4466 cat( |
| 2186 "ERROR!!!! Cannot perform ANOVA analysis", | 4467 "ERROR!!!! Cannot perform ANOVA analysis", |
| 2187 "(see next page)\\newpage\n" | 4468 "(see next page)\\newpage\n" |
| 2188 ) | 4469 ) |
| 2189 cat( | 4470 cat( |
| 2195 cat("Unparsed sample names are:\n\n\n", | 4476 cat("Unparsed sample names are:\n\n\n", |
| 2196 "\\begin{quote}\n", | 4477 "\\begin{quote}\n", |
| 2197 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), | 4478 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), |
| 2198 "\n\\end{quote}\n\n") | 4479 "\n\\end{quote}\n\n") |
| 2199 | 4480 |
| 2200 regex_sample_names <- nuke_control_sequences(regex_sample_names) | 4481 regex_sample_names <- latex_printable_control_seqs(regex_sample_names) |
| 2201 | 4482 |
| 2202 cat("\\leavevmode\n\n\n") | 4483 cat("\\leavevmode\n\n\n") |
| 2203 cat("Parsing rule for SampleNames is", | 4484 cat("Parsing rule for SampleNames is", |
| 2204 "\n\n\n", | 4485 "\n\n\n", |
| 2205 "\\text{'", | 4486 "\\text{'", |
| 2211 cat("\nParsed sample names are:\n", | 4492 cat("\nParsed sample names are:\n", |
| 2212 "\\begin{quote}\n", | 4493 "\\begin{quote}\n", |
| 2213 paste(sample_name_matches, collapse = "\n\n\n"), | 4494 paste(sample_name_matches, collapse = "\n\n\n"), |
| 2214 "\n\\end{quote}\n\n") | 4495 "\n\\end{quote}\n\n") |
| 2215 | 4496 |
| 2216 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | 4497 regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) |
| 2217 | 4498 |
| 2218 cat("\\leavevmode\n\n\n") | 4499 cat("\\leavevmode\n\n\n") |
| 2219 cat("Parsing rule for SampleGrouping is", | 4500 cat("Parsing rule for SampleGrouping is", |
| 2220 "\n\n\n", | 4501 "\n\n\n", |
| 2221 "\\text{'", | 4502 "\\text{'", |
| 2230 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), | 4511 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), |
| 2231 "\n\\end{quote}\n\n") | 4512 "\n\\end{quote}\n\n") |
| 2232 | 4513 |
| 2233 } else { | 4514 } else { |
| 2234 | 4515 |
| 2235 p_value_data_anova_ps <- | 4516 if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") |
| 2236 apply( | 4517 if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") |
| 2237 quant_data_imp_qn_log, | 4518 if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") |
| 2238 1, | 4519 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n") |
| 2239 anova_func, | 4520 if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") |
| 2240 grouping_factor = sample_treatment_levels, | 4521 if (print_nb_messages) nbe(see_variable(anova_func), "\n") |
| 2241 one_way_f = one_way_all_categories | 4522 if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") |
| 2242 ) | 4523 if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") |
| 4524 tryCatch( | |
| 4525 { | |
| 4526 p_value_data_anova_ps <- | |
| 4527 row_apply( | |
| 4528 quant_data_imp_qn_log, | |
| 4529 anova_func, | |
| 4530 grouping_factor = smpl_trt, | |
| 4531 one_way_f = one_way_all_categories | |
| 4532 ) | |
| 4533 }, | |
| 4534 error = function(e) { | |
| 4535 mesg <- paste("Could not compute ANOVA because", e$message) | |
| 4536 cat("\n\n", mesg, "\n\n") | |
| 4537 param_df_noexit(e) | |
| 4538 sink(stderr()) | |
| 4539 cat("\n\n", mesg, "\n\n") | |
| 4540 values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") | |
| 4541 cat(values) | |
| 4542 sink() | |
| 4543 knitr::knit_exit() | |
| 4544 exit(code = 1) | |
| 4545 } | |
| 4546 ) | |
| 4547 if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") | |
| 2243 | 4548 |
| 2244 p_value_data_anova_ps_fdr <- | 4549 p_value_data_anova_ps_fdr <- |
| 2245 p.adjust(p_value_data_anova_ps, method = "fdr") | 4550 p.adjust(p_value_data_anova_ps, method = "fdr") |
| 4551 my_ppep_v <- full_data[, 1] | |
| 4552 p_value_data <- list( | |
| 4553 phosphopeptide = my_ppep_v, | |
| 4554 raw_anova_p = p_value_data_anova_ps, | |
| 4555 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, | |
| 4556 missing_values = rowSums(is.na(quant_data)), | |
| 4557 min_group_obs_count = min_group_obs_count | |
| 4558 ) | |
| 2246 p_value_data <- data.frame( | 4559 p_value_data <- data.frame( |
| 2247 phosphopeptide = full_data[, 1], | 4560 phosphopeptide = my_ppep_v, |
| 2248 raw_anova_p = p_value_data_anova_ps, | 4561 raw_anova_p = p_value_data_anova_ps, |
| 2249 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | 4562 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, |
| 2250 ) | 4563 missing_values = rowSums(is.na(quant_data)), |
| 4564 min_group_obs_count = min_group_obs_count | |
| 4565 ) | |
| 4566 p_value_data$quality <- 1.0 / with( | |
| 4567 p_value_data, | |
| 4568 fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) | |
| 4569 ) | |
| 4570 | |
| 4571 p_value_data$ranking <- | |
| 4572 with( | |
| 4573 p_value_data, | |
| 4574 switch( | |
| 4575 g_intensity_hm_criteria, | |
| 4576 "quality" = order(-quality), | |
| 4577 "na_count" = order(missing_values, fdr_adjusted_anova_p), | |
| 4578 # the default is "p_value" | |
| 4579 order(fdr_adjusted_anova_p) | |
| 4580 ) | |
| 4581 ) | |
| 4582 p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] | |
| 4583 | |
| 4584 write.table( | |
| 4585 p_value_data, | |
| 4586 file = "p_value_data.txt", | |
| 4587 sep = "\t", | |
| 4588 col.names = TRUE, | |
| 4589 row.names = FALSE, | |
| 4590 quote = FALSE | |
| 4591 ) | |
| 4592 | |
| 2251 | 4593 |
| 2252 # output ANOVA file to constructed filename, | 4594 # output ANOVA file to constructed filename, |
| 2253 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 4595 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
| 2254 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 4596 # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" |
| 2255 | 4597 |
| 2256 # Re-output datasets to include p-values | 4598 # Re-output datasets to include p-values |
| 2257 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | 4599 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) |
| 2258 write.table( | 4600 write.table( |
| 2259 cbind(metadata_plus_p, quant_data_imp), | 4601 cbind(metadata_plus_p, quant_data_imp), |
| 2260 file = imputed_data_filename, | 4602 file = imputed_data_filename, |
| 2261 sep = "\t", | 4603 sep = "\t", |
| 2262 col.names = TRUE, | 4604 col.names = TRUE, |
| 2272 row.names = FALSE, | 4614 row.names = FALSE, |
| 2273 quote = FALSE | 4615 quote = FALSE |
| 2274 ) | 4616 ) |
| 2275 | 4617 |
| 2276 | 4618 |
| 2277 p_value_data <- | |
| 2278 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | |
| 2279 | |
| 2280 first_page_suppress <- 1 | 4619 first_page_suppress <- 1 |
| 2281 number_of_peptides_found <- 0 | 4620 number_of_peptides_found <- 0 |
| 2282 cutoff <- val_fdr[1] | 4621 cutoff <- val_fdr[1] |
| 2283 for (cutoff in val_fdr) { | 4622 for (cutoff in val_fdr) { |
| 2284 if (number_of_peptides_found > 49) { | 4623 #loop through FDR cutoffs |
| 4624 if (number_of_peptides_found > g_intensity_hm_rows - 1) { | |
| 2285 cat("\\leavevmode\n\n\n") | 4625 cat("\\leavevmode\n\n\n") |
| 2286 break | 4626 break |
| 2287 } | 4627 } |
| 2288 | 4628 |
| 2289 #loop through FDR cutoffs | 4629 bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) |
| 4630 bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) | |
| 4631 g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) | |
| 4632 bool_4 <- (p_value_data$quality >= params$minQuality) | |
| 4633 bool_3 <- as.logical( | |
| 4634 as.integer(bool_1) * | |
| 4635 as.integer(bool_2) * | |
| 4636 as.integer(bool_4) | |
| 4637 ) | |
| 4638 if (print_trace_messages) { | |
| 4639 if (length(bool_1) > 30) { | |
| 4640 cat_variable(bool_1, force_str = TRUE) | |
| 4641 cat_variable(bool_2, force_str = TRUE) | |
| 4642 cat_variable(bool_3, force_str = TRUE) | |
| 4643 } else { | |
| 4644 cat_variable(bool_1, suffix = "\n\n") | |
| 4645 cat_variable(bool_2, suffix = "\n\n") | |
| 4646 cat_variable(bool_3, suffix = "\n\n") | |
| 4647 } | |
| 4648 cat_variable(length(bool_3), digits = 0, suffix = "; ") | |
| 4649 cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") | |
| 4650 } | |
| 2290 | 4651 |
| 2291 filtered_p <- | 4652 filtered_p <- |
| 2292 p_value_data[ | 4653 p_value_data[bool_3, , drop = FALSE] |
| 2293 which(p_value_data$fdr_adjusted_anova_p < cutoff), | 4654 filtered_p <- |
| 2294 , drop = FALSE | 4655 filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] |
| 2295 ] | 4656 |
| 4657 if (print_trace_messages) | |
| 4658 cat_variable(filtered_p, force_str = TRUE) | |
| 4659 | |
| 4660 have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 | |
| 4661 filter_result_string <- | |
| 4662 sprintf( | |
| 4663 "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", | |
| 4664 "After filtering for ANOVA results", | |
| 4665 if (have_remaining_peptides) | |
| 4666 as.character(sum(bool_3, na.rm = TRUE)) | |
| 4667 else | |
| 4668 "none", | |
| 4669 length(bool_3), | |
| 4670 sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), | |
| 4671 sprintf( | |
| 4672 "more than %0.0f observations in some groups", | |
| 4673 max(0, g_intensity_min_per_class - 1) | |
| 4674 ) | |
| 4675 ) | |
| 4676 | |
| 2296 filtered_data_filtered <- | 4677 filtered_data_filtered <- |
| 2297 quant_data_imp_qn_log[ | 4678 quant_data_imp_qn_log[ |
| 2298 rownames(filtered_p), | 4679 rownames(filtered_p), |
| 2299 , drop = FALSE | 4680 , drop = FALSE |
| 2300 ] | 4681 ] |
| 4682 # order by p-value | |
| 2301 filtered_data_filtered <- | 4683 filtered_data_filtered <- |
| 2302 filtered_data_filtered[ | 4684 filtered_data_filtered[ |
| 2303 order(filtered_p$fdr_adjusted_anova_p), | 4685 order(filtered_p$fdr_adjusted_anova_p), |
| 2304 , drop = FALSE | 4686 , drop = FALSE |
| 2305 ] | 4687 ] |
| 2306 | 4688 |
| 2307 # <!-- ACE insertion start --> | 4689 if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) { |
| 2308 | |
| 2309 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | |
| 2310 if (first_page_suppress == 1) { | 4690 if (first_page_suppress == 1) { |
| 2311 first_page_suppress <- 0 | 4691 first_page_suppress <- 0 |
| 2312 } else { | 4692 } else { |
| 2313 cat("\\newpage\n") | 4693 cat("\\newpage\n") |
| 2314 } | 4694 } |
| 2315 if (nrow(filtered_data_filtered) > 1) { | 4695 latex_samepage({ |
| 2316 subsection_header(sprintf( | 4696 cat(filter_result_string) |
| 2317 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", | 4697 if (nrow(filtered_data_filtered) > 1) { |
| 2318 nrow(filtered_data_filtered), | 4698 cat(subsection_header(sprintf( |
| 2319 cutoff | 4699 "Intensity distributions for %d phosphopeptides\n", |
| 2320 )) | 4700 nrow(filtered_data_filtered) |
| 2321 } else { | 4701 ))) |
| 2322 subsection_header(sprintf( | 4702 } else { |
| 2323 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | 4703 cat(subsection_header(sprintf( |
| 2324 rownames(filtered_data_filtered)[1], | 4704 "Intensity distribution for one phosphopeptide (%s)\n", |
| 2325 cutoff | 4705 rownames(filtered_data_filtered)[1] |
| 2326 )) | 4706 ))) |
| 2327 } | 4707 } |
| 2328 cat("\n\n\n") | 4708 }) # end latex_samepage |
| 2329 cat("\n\n\n") | 4709 |
| 2330 | |
| 2331 old_oma <- par("oma") | |
| 2332 old_par <- par( | 4710 old_par <- par( |
| 2333 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | 4711 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), |
| 2334 oma = old_oma * c(1, 1, 0.3, 1), | 4712 oma = old_oma * c(1, 1, 0.3, 1), |
| 2335 cex.main = 0.9, | 4713 cex.main = 0.9, |
| 2336 cex.axis = 0.7, | 4714 cex.axis = 0.7, |
| 2337 fin = c(9, 7.25) | 4715 fin = c(9, 7.25) |
| 2338 ) | 4716 ) |
| 2339 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
| 2340 # Vertical plot | 4717 # Vertical plot |
| 2341 colnames(filtered_data_filtered) <- sample_name_matches | 4718 colnames(filtered_data_filtered) <- sample_name_matches |
| 2342 tryCatch( | 4719 tryCatch( |
| 2343 boxplot( | 4720 boxplot( |
| 2344 filtered_data_filtered, | 4721 filtered_data_filtered, |
| 2345 main = "Imputed, normalized intensities", # no line plot | 4722 main = "Imputed, normalized intensities", # no line plot |
| 2346 las = 2, | 4723 las = 2, |
| 2347 cex.axis = 0.9 * sample_name_shrink, | 4724 cex.axis = 0.9 * sample_name_grow, |
| 2348 col = const_boxplot_fill, | 4725 col = const_boxplot_fill, |
| 2349 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 4726 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), |
| 4727 notch = FALSE, | |
| 4728 varwidth = boxplot_varwidth | |
| 2350 ), | 4729 ), |
| 2351 error = function(e) print(e) | 4730 error = function(e) { |
| 4731 print(e) | |
| 4732 cat_margins() | |
| 4733 } | |
| 4734 | |
| 2352 ) | 4735 ) |
| 2353 par(old_par) | 4736 par(old_par) |
| 2354 } else { | 4737 } else { |
| 2355 cat(sprintf( | 4738 cat(sprintf( |
| 2356 "%s < %0.2f\n\n\n\n\n", | 4739 "%s < %0.2f\n\n\n\n\n", |
| 2357 "No peptides were found to have cutoff adjusted p-value", | 4740 "No peptides were found to have cutoff adjusted p-value", |
| 2358 cutoff | 4741 cutoff |
| 2359 )) | 4742 )) |
| 2360 } | 4743 } |
| 2361 | 4744 |
| 2362 if (nrow(filtered_data_filtered) > 0) { | 4745 if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { |
| 2363 # Add Phosphopeptide column to anova_filtered table | 4746 # Add Phosphopeptide column to anova_filtered table |
| 2364 # The assumption here is that the first intensity is unique; | 4747 # The assumption here is that the first intensity is unique; |
| 2365 # this is a hokey assumption but almost definitely will | 4748 # this is a hokey assumption but almost definitely will |
| 2366 # be true in the real world unless there is a computation | 4749 # be true in the real world unless there is a computation |
| 2367 # error upstream. | 4750 # error upstream. |
| 2390 by.y = "Phosphopeptide" | 4773 by.y = "Phosphopeptide" |
| 2391 ) | 4774 ) |
| 2392 | 4775 |
| 2393 # Produce heatmap to visualize significance and the effect of imputation | 4776 # Produce heatmap to visualize significance and the effect of imputation |
| 2394 | 4777 |
| 2395 anova_filtered_merge_format <- sapply( | |
| 2396 X = filtered_p$fdr_adjusted_anova_p | |
| 2397 , | |
| 2398 FUN = function(x) { | |
| 2399 if (x > 0.01) | |
| 2400 paste0("%s (%0.", 1 + ceiling(-log10(x)), "f)") | |
| 2401 else | |
| 2402 paste0("%s (%0.2e)") | |
| 2403 } | |
| 2404 ) | |
| 2405 | |
| 2406 cat_hm_heading <- function(m, cutoff) { | 4778 cat_hm_heading <- function(m, cutoff) { |
| 2407 if (nrow(m) > intensity_hm_rows) { | 4779 if (nrow(m) > g_intensity_hm_rows) { |
| 2408 cat("\\newpage\n") | 4780 cat("\\clearpage\n") |
| 2409 subsection_header( | 4781 cat(subsection_header( |
| 2410 paste( | 4782 paste( |
| 2411 sprintf("Heatmap for the %d most-significant peptides", | 4783 sprintf("Heatmap for the %d most-significant peptides", |
| 2412 intensity_hm_rows), | 4784 g_intensity_hm_rows), |
| 2413 sprintf("whose adjusted p-value < %0.2f\n", cutoff) | 4785 sprintf("whose adjusted p-value < %0.2f\n", cutoff) |
| 2414 ) | 4786 ) |
| 2415 ) | 4787 )) |
| 2416 } else { | 4788 } else { |
| 2417 if (nrow(m) == 0) { | 4789 if (nrow(m) == 0) { |
| 2418 return(FALSE) | 4790 return(FALSE) |
| 2419 } else { | 4791 } else { |
| 2420 subsection_header( | 4792 cat(subsection_header( |
| 2421 paste( | 4793 paste( |
| 2422 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | 4794 sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), |
| 2423 sprintf("adjusted p-value < %0.2f\n", cutoff) | 4795 sprintf("adjusted p-value < %0.2f\n", cutoff) |
| 2424 ) | 4796 ) |
| 2425 ) | 4797 )) |
| 2426 } | 4798 } |
| 2427 } | 4799 } |
| 2428 cat("\n\n\n") | 4800 cat("\n\n\n") |
| 2429 cat("\n\n\n") | 4801 cat("\n\n\n") |
| 2430 return(TRUE) | 4802 return(TRUE) |
| 2431 } | 4803 } |
| 2432 | 4804 |
| 2433 # construct matrix with appropriate rownames | 4805 # construct matrix with appropriate rownames |
| 2434 m <- | 4806 m <- |
| 2435 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | 4807 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) |
| 2436 if (nrow(m) > 0) { | 4808 nrow_m <- nrow(m) |
| 4809 ncol_m <- ncol(m) | |
| 4810 if (nrow_m > 0) { | |
| 2437 rownames_m <- rownames(m) | 4811 rownames_m <- rownames(m) |
| 2438 rownames(m) <- sapply( | 4812 q <- data.frame(pepname = rownames_m) |
| 2439 X = seq_len(nrow(m)) | 4813 g <- sqldf(" |
| 2440 , | 4814 SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name |
| 4815 FROM q, metadata_plus_p AS met | |
| 4816 WHERE q.pepname = met.Phosphopeptide | |
| 4817 ORDER BY q.rowid | |
| 4818 ") | |
| 4819 tmp <- sapply( | |
| 4820 X = seq_len(nrow(g)), | |
| 4821 FUN = function(i) { | |
| 4822 pre <- strsplit(g$gene_name[i], "; ")[[1]] | |
| 4823 rslt <- paste(unique(pre), sep = "; ") | |
| 4824 return(rslt) | |
| 4825 } | |
| 4826 ) | |
| 4827 tmp <- unlist(tmp) | |
| 4828 tmp <- | |
| 4829 make.names(tmp, unique = TRUE) | |
| 4830 tmp <- sub( | |
| 4831 "No_Gene_Name", | |
| 4832 "not_found", | |
| 4833 tmp, | |
| 4834 fixed = TRUE | |
| 4835 ) | |
| 4836 ten_trunc_names <- trunc_ppep(rownames_m) | |
| 4837 tmp <- sapply( | |
| 4838 X = seq_len(nrow_m), | |
| 2441 FUN = function(i) { | 4839 FUN = function(i) { |
| 2442 sprintf( | 4840 sprintf( |
| 2443 anova_filtered_merge_format[i], | 4841 "(%s) %s", |
| 2444 rownames_m[i], | 4842 tmp[i], |
| 2445 signif(filtered_p$fdr_adjusted_anova_p[i], 2) | 4843 ten_trunc_names[i] |
| 2446 ) | 4844 ) |
| 2447 } | 4845 } |
| 2448 ) | 4846 ) |
| 4847 rownames(m) <- tmp | |
| 2449 } | 4848 } |
| 2450 # draw the heading and heatmap | 4849 # draw the heading and heatmap |
| 2451 if (nrow(m) > 0) { | 4850 if (nrow_m > 0) { |
| 2452 number_of_peptides_found <- | 4851 number_of_peptides_found <- |
| 2453 draw_ppep_heatmap( | 4852 ppep_heatmap( |
| 2454 m = m, | 4853 m = m, |
| 2455 cutoff = cutoff, | 4854 cutoff = cutoff, |
| 2456 hm_heading_function = cat_hm_heading, | 4855 hm_heading_function = cat_hm_heading, |
| 2457 hm_main_title = | 4856 hm_main_title = |
| 2458 "log(intensities), row-scaled, unimputed, unnormalized", | 4857 "log(intensities), row-scaled, unimputed, unnormalized", |
| 2459 suppress_row_dendrogram = FALSE | 4858 suppress_row_dendrogram = FALSE, |
| 4859 master_cex = 0.35, | |
| 4860 sepcolor = "black", | |
| 4861 colsep = sample_colsep | |
| 2460 ) | 4862 ) |
| 2461 if (number_of_peptides_found > 1) { | 4863 if (number_of_peptides_found > 1) { |
| 2462 cat("\\leavevmode\n") | 4864 cat("\\leavevmode\n") |
| 2463 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses | |
| 2464 after the phosphopeptide sequence.\n\n") | |
| 2465 } | 4865 } |
| 2466 } | 4866 } |
| 2467 } | 4867 } |
| 2468 } | 4868 } |
| 2469 } | 4869 } |
| 4870 cat(filter_result_string) | |
| 2470 cat("\\leavevmode\n") | 4871 cat("\\leavevmode\n") |
| 4872 | |
| 4873 if (!g_can_run_ksea) { | |
| 4874 errmsg <- paste("Cannot proceed with KSEA analysis", | |
| 4875 "because too many values are missing.") | |
| 4876 if (FALSE) cat0( | |
| 4877 errmsg, | |
| 4878 "\\stepcounter{offset}\n", | |
| 4879 "\\stepcounter{offset}\n", | |
| 4880 "\\stepcounter{offset}\n", | |
| 4881 " in ", | |
| 4882 table_href(), | |
| 4883 ".\n\n" | |
| 4884 ) | |
| 4885 if (FALSE) { | |
| 4886 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
| 4887 } else { | |
| 4888 if (print_nb_messages) nbe(see_variable(p_value_data)) | |
| 4889 | |
| 4890 display_p_value_data <- p_value_data | |
| 4891 display_p_value_data$raw_anova_p <- | |
| 4892 sprintf("%0.3g", display_p_value_data$raw_anova_p) | |
| 4893 display_p_value_data$fdr_adjusted_anova_p <- | |
| 4894 sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) | |
| 4895 display_p_value_data$quality <- | |
| 4896 sprintf("%0.3g", display_p_value_data$quality) | |
| 4897 | |
| 4898 headers_1st_line <- | |
| 4899 c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") | |
| 4900 headers_2nd_line <- | |
| 4901 c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") | |
| 4902 data_frame_tabbing_latex( | |
| 4903 x = display_p_value_data, | |
| 4904 tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), | |
| 4905 use_subsubsection_header = FALSE, | |
| 4906 headings = c(headers_1st_line, headers_2nd_line), | |
| 4907 caption = "ANOVA results" | |
| 4908 | |
| 4909 ) | |
| 4910 } | |
| 4911 data_frame_tabbing_latex( | |
| 4912 x = save_sample_treatment_df, | |
| 4913 tabstops = c(1.25, 1.25), | |
| 4914 caption = "Sample classes", | |
| 4915 use_subsubsection_header = FALSE | |
| 4916 ) | |
| 4917 param_df_exit() | |
| 4918 knitr::knit_exit() | |
| 4919 return(invisible(-1)) | |
| 4920 } | |
| 4921 | |
| 2471 ``` | 4922 ``` |
| 2472 | 4923 |
| 2473 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 4924 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
| 2474 | 4925 |
| 2475 if (count_of_treatment_levels > 1) { | 4926 if (g_can_run_ksea && count_of_treatment_levels > 1) { |
| 2476 # Prepare two-way contrasts with adjusted p-values | 4927 # Prepare two-way contrasts with adjusted p-values |
| 2477 # Strategy: | 4928 # Strategy: |
| 2478 # - use imputed, log-transformed data: | 4929 # - use imputed, log-transformed data: |
| 2479 # - remember this when computing log2(fold-change) | 4930 # - remember this when computing log2(fold-change) |
| 2480 # - each contrast is between a combination of trt levels | 4931 # - each contrast is between a combination of trt levels |
| 2486 # - adjust p-value, assuming that | 4937 # - adjust p-value, assuming that |
| 2487 # (# of pppeps)*(# of contrasts) tests were performed | 4938 # (# of pppeps)*(# of contrasts) tests were performed |
| 2488 | 4939 |
| 2489 # Each contrast is between a combination of trt levels | 4940 # Each contrast is between a combination of trt levels |
| 2490 m2 <- combn( | 4941 m2 <- combn( |
| 2491 x = seq_len(length(levels(sample_treatment_levels))), | 4942 x = seq_len(length(levels(smpl_trt))), |
| 2492 m = 2, | 4943 m = 2, |
| 2493 simplify = TRUE | 4944 simplify = TRUE |
| 2494 ) | 4945 ) |
| 2495 contrast_count <- ncol(m2) | 4946 contrast_count <- ncol(m2) |
| 2496 | 4947 |
| 2500 f_m2 <- | 4951 f_m2 <- |
| 2501 function(cntrst, lvl1, lvl2) { | 4952 function(cntrst, lvl1, lvl2) { |
| 2502 return( | 4953 return( |
| 2503 data.frame( | 4954 data.frame( |
| 2504 contrast = cntrst, | 4955 contrast = cntrst, |
| 2505 level = sample_treatment_levels[ | 4956 level = smpl_trt[ |
| 2506 sample_treatment_levels %in% | 4957 smpl_trt %in% |
| 2507 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4958 levels(smpl_trt)[c(lvl1, lvl2)] |
| 2508 ], | 4959 ], |
| 2509 label = sample_name_matches[ | 4960 label = sample_name_matches[ |
| 2510 sample_treatment_levels %in% | 4961 smpl_trt %in% |
| 2511 levels(sample_treatment_levels)[c(lvl1, lvl2)] | 4962 levels(smpl_trt)[c(lvl1, lvl2)] |
| 2512 ] | 4963 ] |
| 2513 ) | 4964 ) |
| 2514 ) | 4965 ) |
| 2515 } | 4966 } |
| 2516 # - compute a df for each contrast | 4967 # - compute a df for each contrast |
| 2686 ; | 5137 ; |
| 2687 " | 5138 " |
| 2688 ) | 5139 ) |
| 2689 | 5140 |
| 2690 # - create contrast-metadata table | 5141 # - create contrast-metadata table |
| 5142 if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") | |
| 2691 dml_no_rows_exec(db, " | 5143 dml_no_rows_exec(db, " |
| 2692 CREATE TABLE contrast_lvl_lvl_metadata | 5144 CREATE TABLE contrast_lvl_lvl_metadata |
| 2693 AS | 5145 AS |
| 2694 SELECT DISTINCT | 5146 SELECT DISTINCT |
| 2695 a.contrast AS ab_contrast, | 5147 a.contrast AS ab_contrast, |
| 2796 rownames(grouping_factor) <- grouping_factor$sample | 5248 rownames(grouping_factor) <- grouping_factor$sample |
| 2797 grouping_factor <- grouping_factor[, "level", drop = FALSE] | 5249 grouping_factor <- grouping_factor[, "level", drop = FALSE] |
| 2798 | 5250 |
| 2799 # - run the two-level (one-way) test | 5251 # - run the two-level (one-way) test |
| 2800 p_value_data_contrast_ps <- | 5252 p_value_data_contrast_ps <- |
| 2801 apply( | 5253 row_apply( |
| 2802 X = contrast_cast_data, | 5254 x = contrast_cast_data, |
| 2803 MARGIN = 1, # apply to rows | 5255 fun = anova_func, |
| 2804 FUN = anova_func, | |
| 2805 grouping_factor = | 5256 grouping_factor = |
| 2806 as.factor(grouping_factor$level), # anova_func arg2 | 5257 as.factor(grouping_factor$level), # anova_func arg2 |
| 2807 one_way_f = one_way_two_categories, # anova_func arg3 | 5258 one_way_f = one_way_two_categories, # anova_func arg3 |
| 2808 simplify = TRUE # TRUE is the default for simplify | 5259 simplify = TRUE # TRUE is the default for simplify |
| 2809 ) | 5260 ) |
| 3013 AND NOT m.`Gene` = 'No_Gene_Name' | 5464 AND NOT m.`Gene` = 'No_Gene_Name' |
| 3014 AND NOT v.log2_fc = 0 | 5465 AND NOT v.log2_fc = 0 |
| 3015 ; | 5466 ; |
| 3016 " | 5467 " |
| 3017 ) | 5468 ) |
| 5469 # We are done with DDL and insertion | |
| 5470 RSQLite::dbDisconnect(db) | |
| 3018 } | 5471 } |
| 3019 ``` | 5472 ``` |
| 3020 | 5473 |
| 3021 ```{r echo = FALSE, results = 'asis'} | 5474 ```{r echo = FALSE, results = 'asis'} |
| 3022 cat("\\newpage\n") | 5475 cat("\\newpage\n") |
| 3023 ``` | 5476 ``` |
| 3024 | 5477 |
| 3025 # KSEA Analysis | 5478 # KSEA Analysis Summaries |
| 3026 | 5479 |
| 3027 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | 5480 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: |
| 3028 | 5481 |
| 3029 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | 5482 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp |
| 3030 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | 5483 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). |
| 3031 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | 5484 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). |
| 3032 | 5485 |
| 3033 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | 5486 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: |
| 3034 | 5487 |
| 3035 $$ | 5488 $$ |
| 3036 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | 5489 \text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} |
| 3037 $$ | 5490 $$ |
| 3038 | 5491 |
| 3039 and fold-enrichment is computed as: | 5492 and fold-enrichment is computed as: |
| 3040 | 5493 |
| 3041 $$ | 5494 $$ |
| 3042 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | 5495 \text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} |
| 3043 $$ | 5496 $$ |
| 3044 | 5497 |
| 3045 where: | 5498 where: |
| 3046 | 5499 |
| 3047 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | 5500 - $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$, |
| 3048 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | 5501 - $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and |
| 3049 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | 5502 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, |
| 3050 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | 5503 - $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset. |
| 3051 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | 5504 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. |
| 3052 | 5505 |
| 3053 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | 5506 $\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score. |
| 3054 | 5507 |
| 3055 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | 5508 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). |
| 3056 | 5509 |
| 3057 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | 5510 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. |
| 3058 | 5511 |
| 3059 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | 5512 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. |
| 3060 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | 5513 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). |
| 3061 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | 5514 - Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). |
| 3062 | 5515 |
| 3063 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 5516 For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality". |
| 5517 | |
| 5518 Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1. | |
| 5519 | |
| 5520 ```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} | |
| 5521 cat("\\clearpage\n") | |
| 3064 | 5522 |
| 3065 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | 5523 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) |
| 3066 | 5524 |
| 3067 # -- eliminate the table that's about to be defined | 5525 # -- eliminate the table that's about to be defined |
| 3068 ddl_exec(db, " | 5526 ddl_exec(db, " |
| 3170 sub_title <- contrast_longlabel | 5628 sub_title <- contrast_longlabel |
| 3171 tryCatch( | 5629 tryCatch( |
| 3172 expr = { | 5630 expr = { |
| 3173 ksea_scores_rslt <- | 5631 ksea_scores_rslt <- |
| 3174 ksea_scores( | 5632 ksea_scores( |
| 3175 ksdata = pseudo_ksdata, # KSEAapp::KSData, | 5633 ksdata = pseudo_ksdata, |
| 3176 px = kseaapp_input, | 5634 px = kseaapp_input, |
| 3177 networkin = TRUE, | 5635 networkin = TRUE, |
| 3178 networkin_cutoff = 2 | 5636 networkin_cutoff = 2, |
| 5637 minimum_substrate_count = ksea_min_substrate_count | |
| 3179 ) | 5638 ) |
| 5639 | |
| 5640 if (FALSE) { | |
| 5641 ksea_scores_rslt <- | |
| 5642 ksea_scores_rslt[ | |
| 5643 ksea_scores_rslt$m >= ksea_min_substrate_count, | |
| 5644 , | |
| 5645 drop = FALSE | |
| 5646 ] | |
| 5647 } | |
| 5648 | |
| 5649 if (FALSE) { | |
| 5650 data_frame_tabbing_latex( | |
| 5651 x = ksea_scores_rslt, | |
| 5652 tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8), | |
| 5653 caption = paste("KSEA scores for contrast ", | |
| 5654 cntrst_b_level, "to", cntrst_a_level), | |
| 5655 use_subsubsection_header = FALSE | |
| 5656 ) | |
| 5657 } | |
| 5658 | |
| 5659 if (FALSE) { | |
| 5660 if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n") | |
| 5661 cat_variable(ksea_scores_rslt) | |
| 5662 cat("\n\\clearpage\n") | |
| 5663 } | |
| 3180 | 5664 |
| 3181 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 5665 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { |
| 3182 next_index <- 1 + next_index | 5666 next_index <- 1 + next_index |
| 3183 rslt$score_list[[next_index]] <- ksea_scores_rslt | 5667 rslt$score_list[[next_index]] <- ksea_scores_rslt |
| 3184 rslt$name_list[[next_index]] <- contrast_label | 5668 rslt$name_list[[next_index]] <- contrast_label |
| 3185 rslt$longname_list[[next_index]] <- contrast_longlabel | 5669 rslt$longname_list[[next_index]] <- contrast_longlabel |
| 3186 low_fdr_print( | 5670 ksea_low_fdr_print( |
| 3187 rslt = rslt, | 5671 rslt = rslt, |
| 3188 i_cntrst = i_cntrst, | 5672 i_cntrst = i_cntrst, |
| 3189 i = next_index, | 5673 i = next_index, |
| 3190 a_level = cntrst_a_level, | 5674 a_level = cntrst_a_level, |
| 3191 b_level = cntrst_b_level, | 5675 b_level = cntrst_b_level, |
| 3192 fold_change = cntrst_fold_change, | 5676 fold_change = cntrst_fold_change, |
| 3193 caption = contrast_longlabel | 5677 caption = contrast_longlabel |
| 3194 ) | 5678 ) |
| 3195 } | 5679 } |
| 3196 }, | 5680 }, |
| 3197 error = function(e) str(e) | 5681 error = function(e) { |
| 5682 str(e) | |
| 5683 cat_margins() | |
| 5684 } | |
| 3198 ) | 5685 ) |
| 3199 } | 5686 } |
| 3200 | 5687 |
| 3201 plotted_kinases <- NULL | 5688 plotted_kinases <- NULL |
| 3202 if (length(rslt$score_list) > 1) { | 5689 if (g_can_run_ksea && length(rslt$score_list) > 1) { |
| 3203 for (i in seq_len(length(ksea_heatmap_titles))) { | 5690 for (i in seq_len(length(ksea_heatmap_titles))) { |
| 3204 hdr <- ksea_heatmap_titles[[i]] | 5691 hdr <- ksea_heatmap_titles[[i]] |
| 3205 which_kinases <- i | 5692 which_kinases <- i |
| 3206 | 5693 |
| 3207 cat("\\clearpage\n\\begin{center}\n") | 5694 cat("\\clearpage\n\\begin{center}\n") |
| 3208 if (i == const_ksea_astrsk_kinases) { | 5695 if (i == const_ksea_astrsk_kinases) { |
| 3209 subsection_header(hdr) | 5696 cat(subsection_header(hdr)) |
| 3210 } else { | 5697 } else { |
| 3211 subsection_header(hdr) | 5698 cat(subsection_header(hdr)) |
| 3212 } | 5699 } |
| 3213 cat("\\end{center}\n") | 5700 cat("\\end{center}\n") |
| 3214 | 5701 |
| 3215 plotted_kinases <- ksea_heatmap( | 5702 plotted_kinases <- ksea_heatmap( |
| 3216 # the data frame outputs from the KSEA.Scores() function, in list format | 5703 # the data frame outputs from the KSEA.Scores() function, in list format |
| 3225 # a numeric value between 0 and infinity indicating the min. number of | 5712 # a numeric value between 0 and infinity indicating the min. number of |
| 3226 # substrates a kinase must have to be included in the heatmap | 5713 # substrates a kinase must have to be included in the heatmap |
| 3227 m_cutoff = 1, | 5714 m_cutoff = 1, |
| 3228 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | 5715 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff |
| 3229 # for indicating significant kinases in the heatmap | 5716 # for indicating significant kinases in the heatmap |
| 3230 p_cutoff = 0.05, | 5717 p_cutoff = params$kseaCutoffThreshold, |
| 3231 # a binary input of TRUE or FALSE, indicating whether or not to perform | 5718 # a binary input of TRUE or FALSE, indicating whether or not to perform |
| 3232 # hierarchical clustering of the sample columns | 5719 # hierarchical clustering of the sample columns |
| 3233 sample_cluster = TRUE, | 5720 sample_cluster = TRUE, |
| 3234 # a binary input of TRUE or FALSE, indicating whether or not to export | 5721 # a binary input of TRUE or FALSE, indicating whether or not to export |
| 3235 # the heatmap as a .png image into the working directory | 5722 # the heatmap as a .png image into the working directory |
| 3242 ylab = "Kinase", | 5729 ylab = "Kinase", |
| 3243 # print which kinases: | 5730 # print which kinases: |
| 3244 # - 1 : all kinases | 5731 # - 1 : all kinases |
| 3245 # - 2 : significant kinases | 5732 # - 2 : significant kinases |
| 3246 # - 3 : non-significant kinases | 5733 # - 3 : non-significant kinases |
| 3247 which_kinases = which_kinases | 5734 which_kinases = which_kinases, |
| 5735 margins = c(7, 15) | |
| 3248 ) | 5736 ) |
| 3249 if (!is.null(plotted_kinases)) { | 5737 if (!is.null(plotted_kinases)) { |
| 3250 cat("\\begin{center}\n") | 5738 cat("\\begin{center}\n") |
| 3251 cat("Color intensity reflects $z$-score magnitudes; hue reflects $z$-score sign.\n") | |
| 3252 if (which_kinases != const_ksea_nonastrsk_kinases) | 5739 if (which_kinases != const_ksea_nonastrsk_kinases) |
| 3253 cat("Asterisks reflect significance.\n") | 5740 cat("Asterisks reflect significance.\n") |
| 3254 cat("\\end{center}\n") | 5741 cat("\\end{center}\n") |
| 3255 } | 5742 } |
| 3256 } # end for (i in ... | 5743 } # end for (i in ... |
| 3257 } # end if (length ... | 5744 } # end if (length ... |
| 3258 | 5745 ``` |
| 3259 for (i_cntrst in seq_len(length(rslt$score_list))) { | 5746 |
| 3260 next_index <- i_cntrst | 5747 ```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} |
| 3261 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | 5748 ksea_prints <- list() |
| 3262 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | 5749 ksea_barplots <- list() |
| 3263 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | 5750 for (i_cntrst in seq_len(length(rslt$score_list))) { |
| 3264 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | 5751 next_index <- i_cntrst |
| 3265 contrast_longlabel <- ( | 5752 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] |
| 5753 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
| 5754 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
| 5755 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
| 5756 contrast_longlabel <- ( | |
| 5757 sprintf( | |
| 5758 "Class %s -> Class %s", | |
| 5759 contrast_metadata_df[i_cntrst, "b_level"], | |
| 5760 contrast_metadata_df[i_cntrst, "a_level"] | |
| 5761 ) | |
| 5762 ) | |
| 5763 main_title <- ( | |
| 5764 sprintf( | |
| 5765 "Change from treatment %s to treatment %s", | |
| 5766 contrast_metadata_df[i_cntrst, "b_level"], | |
| 5767 contrast_metadata_df[i_cntrst, "a_level"] | |
| 5768 ) | |
| 5769 ) | |
| 5770 sub_title <- contrast_longlabel | |
| 5771 tryCatch( | |
| 5772 expr = { | |
| 5773 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
| 5774 if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE | |
| 5775 | |
| 5776 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
| 5777 sink(deferred <- file()) | |
| 5778 ksea_low_fdr_print( | |
| 5779 rslt = rslt, | |
| 5780 i_cntrst = i_cntrst, | |
| 5781 i = next_index, | |
| 5782 a_level = cntrst_a_level, | |
| 5783 b_level = cntrst_b_level, | |
| 5784 fold_change = cntrst_fold_change, | |
| 5785 caption = contrast_longlabel, | |
| 5786 write_db = FALSE, | |
| 5787 anchor = const_table_anchor_t | |
| 5788 ) | |
| 5789 cat("\n") | |
| 5790 sink() | |
| 5791 lines <- | |
| 5792 paste( | |
| 5793 readLines(deferred, warn = FALSE), | |
| 5794 collapse = "\n" | |
| 5795 ) | |
| 5796 close(deferred) | |
| 5797 sq_put(ksea_prints, lines) | |
| 5798 sink(stderr()) | |
| 5799 cat("\n---\n") | |
| 5800 cat_variable(ksea_prints) | |
| 5801 barplot_closure <- | |
| 5802 ksea_low_fdr_barplot_factory( | |
| 5803 rslt = rslt, | |
| 5804 i_cntrst = i_cntrst, | |
| 5805 i = next_index, | |
| 5806 a_level = cntrst_a_level, | |
| 5807 b_level = cntrst_b_level, | |
| 5808 fold_change = cntrst_fold_change, | |
| 5809 caption = contrast_longlabel | |
| 5810 ) | |
| 5811 if (rlang::is_closure(barplot_closure)) | |
| 5812 sq_put(ksea_barplots, barplot_closure) | |
| 5813 else | |
| 5814 sq_put(ksea_barplots, no_op) | |
| 5815 str(ksea_barplots) | |
| 5816 cat("\n...\n") | |
| 5817 sink() | |
| 5818 } | |
| 5819 }, | |
| 5820 error = function(e) { | |
| 5821 str(e) | |
| 5822 cat_margins() | |
| 5823 } | |
| 5824 ) | |
| 5825 } | |
| 5826 ``` | |
| 5827 | |
| 5828 ```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
| 5829 | |
| 5830 have_kinase_descriptions <- | |
| 5831 if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && | |
| 5832 !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) | |
| 5833 ) { | |
| 5834 rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID | |
| 5835 kinase_name_to_desc_uprt <- function(s) { | |
| 5836 rslt <- NULL | |
| 5837 tryCatch( | |
| 5838 { | |
| 5839 which_rows <- eval(s == kinase_name_uprt_lut$kinase) | |
| 5840 kinase_uprtid <- | |
| 5841 kinase_name_uprt_lut[which_rows, 2] | |
| 5842 # filter for first _HUMAN match if any | |
| 5843 grepl_human <- grepl("_HUMAN$", kinase_uprtid) | |
| 5844 if (0 < sum(grepl_human)) | |
| 5845 kinase_uprtid <- kinase_uprtid[grepl_human] | |
| 5846 # filter for first match if any | |
| 5847 if (0 < length(kinase_uprtid)) { | |
| 5848 kinase_uprtid <- kinase_uprtid[1] | |
| 5849 kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] | |
| 5850 if (!is.na(kinase_desc)) | |
| 5851 rslt <- c(kinase_desc, kinase_uprtid) | |
| 5852 else | |
| 5853 rslt <- c(kinase_desc, "") | |
| 5854 } | |
| 5855 }, | |
| 5856 warning = str | |
| 5857 ) | |
| 5858 rslt | |
| 5859 } | |
| 5860 TRUE | |
| 5861 } else { | |
| 5862 kinase_name_to_desc_uprt <- function(s) NULL | |
| 5863 FALSE | |
| 5864 } | |
| 5865 ``` | |
| 5866 | |
| 5867 ```{r write_params, echo = FALSE, results = 'asis'} | |
| 5868 # perhaps this should be moved into the functions section, eventually ... | |
| 5869 write_params <- function(db) { | |
| 5870 # write parameters to report | |
| 5871 | |
| 5872 # write parameters to SQLite output | |
| 5873 | |
| 5874 mqppep_anova_script_param_df <- data.frame( | |
| 5875 script = "mqppep_anova_script.Rmd", | |
| 5876 parameter = names(param_unlist), | |
| 5877 value = param_unlist | |
| 5878 ) | |
| 5879 ddl_exec(db, " | |
| 5880 DROP TABLE IF EXISTS script_parameter; | |
| 5881 " | |
| 5882 ) | |
| 5883 ddl_exec(db, " | |
| 5884 CREATE TABLE IF NOT EXISTS script_parameter( | |
| 5885 script TEXT, | |
| 5886 parameter TEXT, | |
| 5887 value ANY, | |
| 5888 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
| 5889 ) | |
| 5890 ; | |
| 5891 " | |
| 5892 ) | |
| 5893 RSQLite::dbWriteTable( | |
| 5894 conn = db, | |
| 5895 name = "script_parameter", | |
| 5896 value = mqppep_anova_script_param_df, | |
| 5897 append = TRUE | |
| 5898 ) | |
| 5899 | |
| 5900 loaded_packages_df <- sessioninfo::package_info("loaded") | |
| 5901 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
| 5902 loaded_packages_df <- data.frame( | |
| 5903 package = loaded_packages_df$package, | |
| 5904 version = loaded_packages_df$loadedversion, | |
| 5905 date = loaded_packages_df$date | |
| 5906 ) | |
| 5907 #ACE cat("\\clearpage\n\\section{R package versions}\n") | |
| 5908 #ACE data_frame_tabbing_latex( | |
| 5909 #ACE x = loaded_packages_df, | |
| 5910 #ACE tabstops = c(2.5, 1.25), | |
| 5911 #ACE caption = "R package versions" | |
| 5912 #ACE ) | |
| 5913 cat("\\clearpage\n\\section{Input parameter settings}\n") | |
| 5914 data_frame_tabbing_latex( | |
| 5915 x = param_df, | |
| 5916 tabstops = c(1.75), | |
| 5917 underscore_whack = TRUE, | |
| 5918 caption = "Input parameters", | |
| 5919 verbatim = FALSE | |
| 5920 ) | |
| 5921 } | |
| 5922 | |
| 5923 if (!have_kinase_descriptions) { | |
| 5924 write_params(db) | |
| 5925 # We are done with output | |
| 5926 RSQLite::dbDisconnect(db) | |
| 5927 param_df_exit() | |
| 5928 knitr::knit_exit() | |
| 5929 } | |
| 5930 ``` | |
| 5931 | |
| 5932 ```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} | |
| 5933 if (have_kinase_descriptions) { | |
| 5934 my_section_header <- | |
| 3266 sprintf( | 5935 sprintf( |
| 3267 "Class %s -> Class %s", | 5936 "inases whose KSEA %s < %s\n", |
| 3268 contrast_metadata_df[i_cntrst, "b_level"], | 5937 ksea_cutoff_statistic, |
| 3269 contrast_metadata_df[i_cntrst, "a_level"] | 5938 signif(ksea_cutoff_threshold, 2) |
| 3270 ) | 5939 ) |
| 3271 ) | 5940 |
| 3272 main_title <- ( | 5941 # Use enriched kinases to find enriched kinase-substrate pairs |
| 5942 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
| 5943 | |
| 5944 enriched_kinase_descs <- | |
| 5945 Reduce( | |
| 5946 f = function(l, r) { | |
| 5947 lkup <- kinase_name_to_desc_uprt(r) | |
| 5948 if (is.null(lkup)) l | |
| 5949 else r2 <- rbind( | |
| 5950 l, | |
| 5951 data.frame( | |
| 5952 kinase = r, | |
| 5953 uniprot_id = lkup[2], | |
| 5954 description = lkup[1] | |
| 5955 ) | |
| 5956 ) | |
| 5957 }, | |
| 5958 x = enriched_kinases$kinase, | |
| 5959 init = NULL | |
| 5960 ) | |
| 5961 | |
| 5962 if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { | |
| 5963 cat("\n\\clearpage\n") | |
| 5964 data_frame_tabbing_latex( | |
| 5965 x = enriched_kinase_descs, | |
| 5966 tabstops = c(0.9, 1.3), | |
| 5967 headings = c("Kinase", "UniProt ID", "Description"), | |
| 5968 caption = paste0("Descriptions of k", my_section_header) | |
| 5969 ) | |
| 5970 } | |
| 5971 | |
| 5972 if (FALSE) { | |
| 5973 cat_variable(sqldf("SELECT kinase FROM enriched_kinases")) | |
| 5974 cat_variable(sqldf(" | |
| 5975 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 5976 FROM pseudo_ksdata | |
| 5977 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
| 5978 ")) | |
| 5979 data_frame_table_latex( | |
| 5980 x = sqldf(" | |
| 5981 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 5982 FROM pseudo_ksdata | |
| 5983 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
| 5984 "), | |
| 5985 justification = "l l l", | |
| 5986 centered = TRUE, | |
| 5987 caption = "substrates of enriched kinases", | |
| 5988 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
| 5989 underscore_whack = TRUE | |
| 5990 ) | |
| 5991 data_frame_table_latex( | |
| 5992 x = sqldf(" | |
| 5993 SELECT | |
| 5994 gene AS kinase, | |
| 5995 ppep, | |
| 5996 sub_gene, | |
| 5997 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
| 5998 fdr_adjusted_anova_p, | |
| 5999 quality, | |
| 6000 min_group_obs_count | |
| 6001 FROM ( | |
| 6002 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 6003 FROM pseudo_ksdata | |
| 6004 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
| 6005 ), | |
| 6006 p_value_data | |
| 6007 WHERE ppep = phosphopeptide | |
| 6008 GROUP BY kinase, ppep | |
| 6009 ORDER BY kinase, ppep, p_value_data.quality DESC | |
| 6010 "), | |
| 6011 justification = "l l l l l l l", | |
| 6012 centered = TRUE, | |
| 6013 caption = "labeled substrates of enriched kinases", | |
| 6014 anchor = c(const_table_anchor_p, const_table_anchor_t), | |
| 6015 underscore_whack = TRUE | |
| 6016 ) | |
| 6017 } | |
| 6018 all_enriched_substrates <- sqldf(" | |
| 6019 SELECT | |
| 6020 gene AS kinase, | |
| 6021 ppep, | |
| 6022 sub_gene, | |
| 6023 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
| 6024 fdr_adjusted_anova_p, | |
| 6025 quality, | |
| 6026 min_group_obs_count | |
| 6027 FROM ( | |
| 6028 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 6029 FROM pseudo_ksdata | |
| 6030 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
| 6031 ), | |
| 6032 p_value_data | |
| 6033 WHERE ppep = phosphopeptide | |
| 6034 GROUP BY kinase, ppep | |
| 6035 ORDER BY kinase, ppep, p_value_data.quality DESC | |
| 6036 ") | |
| 6037 | |
| 6038 all_enriched_substrates <- | |
| 6039 all_enriched_substrates[ | |
| 6040 all_enriched_substrates$quality >= params$minQuality, | |
| 6041 , | |
| 6042 drop = FALSE | |
| 6043 ] | |
| 6044 | |
| 6045 all_enriched_substrates$sub_gene <- | |
| 6046 sub( | |
| 6047 " ///.*", | |
| 6048 " ...", | |
| 6049 all_enriched_substrates$sub_gene | |
| 6050 ) | |
| 6051 | |
| 6052 all_enriched_substrates$label <- | |
| 6053 with( | |
| 6054 all_enriched_substrates, | |
| 3273 sprintf( | 6055 sprintf( |
| 3274 "Change from treatment %s to treatment %s", | 6056 "(%s-%s) %s", |
| 3275 contrast_metadata_df[i_cntrst, "b_level"], | 6057 kinase, |
| 3276 contrast_metadata_df[i_cntrst, "a_level"] | 6058 trunc_subgene(sub_gene), |
| 3277 ) | 6059 ppep |
| 3278 ) | 6060 ) |
| 3279 sub_title <- contrast_longlabel | 6061 ) |
| 3280 tryCatch( | 6062 |
| 3281 expr = { | 6063 # this global is set to TRUE by cat_enriched_heading immediately below |
| 3282 ksea_scores_rslt <- rslt$score_list[[next_index]] | 6064 g_neednewpage <- FALSE |
| 3283 | 6065 |
| 3284 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | 6066 # helper used to label per-kinase substrate enrichment figure |
| 3285 low_fdr_barplot( | 6067 cat_enriched_heading <- function(m, cut_args) { |
| 3286 rslt = rslt, | 6068 cutoff <- cut_args$cutoff |
| 3287 i_cntrst = i_cntrst, | 6069 kinase <- cut_args$kinase |
| 3288 i = next_index, | 6070 if (g_neednewpage) cat("\\newpage\n") |
| 3289 a_level = cntrst_a_level, | 6071 g_neednewpage <- TRUE |
| 3290 b_level = cntrst_b_level, | 6072 if (nrow(m) > g_intensity_hm_rows) { |
| 3291 fold_change = cntrst_fold_change, | 6073 cat(subsection_header( |
| 3292 caption = contrast_longlabel | |
| 3293 ) | |
| 3294 } | |
| 3295 }, | |
| 3296 error = function(e) str(e) | |
| 3297 ) | |
| 3298 } | |
| 3299 ``` | |
| 3300 | |
| 3301 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 3302 | |
| 3303 # Use enriched kinases to find enriched kinase-substrate pairs | |
| 3304 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
| 3305 all_enriched_substrates <- sqldf(" | |
| 3306 SELECT | |
| 3307 gene AS kinase, | |
| 3308 ppep, | |
| 3309 sub_gene, | |
| 3310 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, | |
| 3311 fdr_adjusted_anova_p | |
| 3312 FROM ( | |
| 3313 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 3314 FROM pseudo_ksdata | |
| 3315 WHERE gene IN (SELECT kinase FROM enriched_kinases) | |
| 3316 ), | |
| 3317 p_value_data | |
| 3318 WHERE ppep = phosphopeptide | |
| 3319 GROUP BY ppep | |
| 3320 ORDER BY fdr_adjusted_anova_p | |
| 3321 ") | |
| 3322 | |
| 3323 # helper used to label per-kinase substrate enrichment figure | |
| 3324 cat_enriched_heading <- function(m, cut_args) { | |
| 3325 cutoff <- cut_args$cutoff | |
| 3326 kinase <- cut_args$kinase | |
| 3327 statistic <- cut_args$statistic | |
| 3328 threshold <- cut_args$threshold | |
| 3329 cat("\\newpage\n") | |
| 3330 if (nrow(m) > intensity_hm_rows) { | |
| 3331 subsection_header( | |
| 3332 paste( | |
| 3333 sprintf( | 6074 sprintf( |
| 3334 "Lowest p-valued %d (of %d) enriched %s-substrates,", | 6075 "Highest-quality %d (of %d) enriched %s-substrates", |
| 3335 intensity_hm_rows, | 6076 g_intensity_hm_rows, |
| 3336 nrow(m), | 6077 nrow(m), |
| 3337 kinase | 6078 kinase |
| 3338 ), | 6079 ) |
| 3339 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | 6080 )) |
| 3340 ) | |
| 3341 ) | |
| 3342 } else { | |
| 3343 if (nrow(m) == 0) { | |
| 3344 return(FALSE) | |
| 3345 } else { | 6081 } else { |
| 3346 subsection_header( | 6082 if (nrow(m) == 0) { |
| 3347 paste( | 6083 return(FALSE) |
| 6084 } else { | |
| 6085 nrow_m <- nrow(m) | |
| 6086 cat(subsection_header( | |
| 3348 sprintf( | 6087 sprintf( |
| 3349 "%d enriched %s-substrates,", | 6088 "%d enriched %s-substrate%s", |
| 3350 nrow(m), | 6089 nrow_m, |
| 3351 kinase | 6090 kinase, |
| 3352 ), | 6091 if (nrow_m > 1) "s" else "" |
| 3353 sprintf( | |
| 3354 " KSEA %s < %0.2f\n", | |
| 3355 statistic, | |
| 3356 threshold | |
| 3357 ) | 6092 ) |
| 6093 )) | |
| 6094 } | |
| 6095 } | |
| 6096 cat("\n\n\n") | |
| 6097 cat("\n\n\n") | |
| 6098 return(TRUE) | |
| 6099 } | |
| 6100 | |
| 6101 # -------------------------------- | |
| 6102 # hack begin - show all substrates | |
| 6103 enriched_substrates <- all_enriched_substrates | |
| 6104 # add "FALSE &&" to prevent listing of substrates | |
| 6105 if (show_enriched_substrates && nrow(enriched_substrates) > 0) { | |
| 6106 short_row_names <- sub( | |
| 6107 "$FAILED_MATCH_GENE_NAME", | |
| 6108 "not_found", | |
| 6109 enriched_substrates$sub_gene, | |
| 6110 fixed = TRUE | |
| 6111 ) | |
| 6112 | |
| 6113 if (print_nb_messages) nbe(see_variable(enriched_substrates)) | |
| 6114 substrates_df <- with( | |
| 6115 enriched_substrates, | |
| 6116 data.frame( | |
| 6117 kinase = kinase, | |
| 6118 substrate = sub(" ///*", "...", short_row_names), | |
| 6119 anova_p_value = signif(fdr_adjusted_anova_p, 2), | |
| 6120 min_group_obs_count = signif(min_group_obs_count, 0), | |
| 6121 quality = signif(quality, 3), | |
| 6122 sequence = trunc_n(30)(ppep) | |
| 3358 ) | 6123 ) |
| 3359 ) | 6124 ) |
| 3360 } | 6125 |
| 3361 } | 6126 substrates_df <- substrates_df[ |
| 3362 cat("\n\n\n") | 6127 with(substrates_df, order(kinase, -quality)), |
| 3363 cat("\n\n\n") | 6128 , |
| 3364 return(TRUE) | 6129 drop = FALSE |
| 3365 } | 6130 ] |
| 3366 | 6131 |
| 3367 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | 6132 if (print_nb_messages) nbe(see_variable(substrates_df)) |
| 3368 if (TRUE) | 6133 if (nrow(substrates_df) < 1) |
| 6134 substrates_df$sequence <- c() | |
| 6135 if (print_nb_messages) nbe(see_variable(substrates_df)) | |
| 6136 names(substrates_df) <- headers_2nd_line <- | |
| 6137 c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") | |
| 6138 headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") | |
| 6139 data_frame_tabbing_latex( | |
| 6140 x = substrates_df, | |
| 6141 tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), | |
| 6142 headings = c(headers_1st_line, headers_2nd_line), | |
| 6143 caption = "Details for all enriched substrates of enriched kinases" | |
| 6144 ) | |
| 6145 rm( | |
| 6146 enriched_substrates, | |
| 6147 substrates_df, | |
| 6148 short_row_names, | |
| 6149 headers_1st_line, | |
| 6150 headers_2nd_line | |
| 6151 ) | |
| 6152 } | |
| 6153 cat("\\clearpage\n") | |
| 6154 # hack end - show all substrates | |
| 6155 # -------------------------------- | |
| 6156 | |
| 6157 # print deferred tables and graphs for kinases from contrasts | |
| 6158 for (i_cntrst in seq_len(length(ksea_prints))) { | |
| 6159 #latex_samepage({ | |
| 6160 cat(ksea_prints[[i_cntrst]]) | |
| 6161 cat("\n") | |
| 6162 ksea_barplots[[i_cntrst]]() | |
| 6163 cat("\n") | |
| 6164 cat("\\clearpage\n") | |
| 6165 #}) | |
| 6166 } | |
| 6167 | |
| 6168 } | |
| 6169 ``` | |
| 6170 | |
| 6171 ```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} | |
| 6172 if (g_can_run_ksea) { | |
| 6173 g_did_enriched_header <- FALSE | |
| 3369 for (kinase_name in sort(enriched_kinases$kinase)) { | 6174 for (kinase_name in sort(enriched_kinases$kinase)) { |
| 3370 enriched_substrates <- | 6175 enriched_substrates <- |
| 3371 all_enriched_substrates[ | 6176 all_enriched_substrates[ |
| 3372 all_enriched_substrates$kinase == kinase_name, | 6177 all_enriched_substrates$kinase == kinase_name, |
| 3373 , | 6178 , |
| 3374 drop = FALSE | 6179 drop = FALSE |
| 3375 ] | 6180 ] |
| 6181 ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) | |
| 3376 enriched_substrates$label <- with( | 6182 enriched_substrates$label <- with( |
| 3377 enriched_substrates, | 6183 enriched_substrates, |
| 3378 sprintf( | 6184 sprintf( |
| 3379 "(%s-%s) %s (%0.2g)", | 6185 "(%s) %s", |
| 3380 kinase, | 6186 make.names( |
| 3381 sub("$FAILED_MATCH_GENE_NAME", "unidentified", sub_gene, fixed = TRUE), | 6187 sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), |
| 3382 ppep, | 6188 unique = TRUE |
| 3383 fdr_adjusted_anova_p | 6189 ), |
| 6190 ten_trunc_ppep | |
| 3384 ) | 6191 ) |
| 3385 ) | 6192 ) |
| 3386 # Get the intensity values for the heatmap | 6193 # Get the intensity values for the heatmap |
| 3387 enriched_intensities <- | 6194 enriched_intensities <- |
| 3388 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | 6195 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) |
| 6196 | |
| 3389 # Remove rows having too many NA values to be relevant | 6197 # Remove rows having too many NA values to be relevant |
| 6198 good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) | |
| 6199 #ACE nbe(see_variable(good_rows), "\n") | |
| 6200 enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] | |
| 6201 enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] | |
| 6202 | |
| 3390 # Rename the rows with the display-name for the heatmap | 6203 # Rename the rows with the display-name for the heatmap |
| 3391 rownames(enriched_intensities) <- | 6204 short_row_names <- sub( |
| 6205 "$FAILED_MATCH_GENE_NAME", | |
| 6206 "not_found", | |
| 6207 enriched_substrates$sub_gene, | |
| 6208 fixed = TRUE | |
| 6209 ) | |
| 6210 short_row_names <- | |
| 6211 make.names(short_row_names, unique = TRUE) | |
| 6212 long_row_names <- | |
| 3392 sapply( | 6213 sapply( |
| 3393 X = rownames(enriched_intensities), | 6214 X = rownames(enriched_intensities), |
| 3394 FUN = function(rn) { | 6215 FUN = function(rn) { |
| 3395 enriched_substrates[enriched_substrates$ppep == rn, "label"] | 6216 enriched_substrates[enriched_substrates$ppep == rn, "label"] |
| 3396 } | 6217 } |
| 3397 ) | 6218 ) |
| 6219 rownames(enriched_intensities) <- long_row_names | |
| 3398 # Format as matrix for heatmap | 6220 # Format as matrix for heatmap |
| 3399 m <- as.matrix(enriched_intensities) | 6221 m <- as.matrix(enriched_intensities) |
| 6222 rownames(m) <- trunc_enriched_substrate(rownames(m)) | |
| 6223 | |
| 6224 #ACE nb("m with bad rows: ", see_variable(m), "\n") | |
| 6225 #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0) | |
| 6226 #ACE nb(see_variable(good_rows), "\n") | |
| 6227 #ACE m <- m[good_rows, , drop = FALSE] | |
| 6228 #ACE nb("m without(?) bad rows: ", see_variable(m), "\n") | |
| 6229 #ACE nb(see_variable(short_row_names), "\n") | |
| 6230 #ACE local_short_row_names <- short_row_names[good_rows] | |
| 6231 #ACE local_long_row_names <- long_row_names[good_rows] | |
| 6232 #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ] | |
| 6233 | |
| 3400 # Draw the heading and heatmap | 6234 # Draw the heading and heatmap |
| 3401 if (nrow(m) > 0) { | 6235 nrow_m <- nrow(m) |
| 6236 if (nrow_m > 0) { | |
| 6237 if (!g_did_enriched_header) { | |
| 6238 cat("\n\\clearpage\n") | |
| 6239 cat(section_header(paste0("K", my_section_header))) | |
| 6240 g_did_enriched_header <- TRUE | |
| 6241 } | |
| 6242 is_na_m <- is.na(m) | |
| 6243 cellnote_m <- is_na_m | |
| 6244 cellnote_m[!is_na_m] <- "" | |
| 6245 cellnote_m[is_na_m] <- "NA" | |
| 3402 cut_args <- new_env() | 6246 cut_args <- new_env() |
| 3403 cut_args$cutoff <- cutoff | 6247 cut_args$cutoff <- cutoff |
| 3404 cut_args$kinase <- kinase_name | 6248 cut_args$kinase <- kinase_name |
| 3405 cut_args$statistic <- ksea_cutoff_statistic | 6249 cut_args$statistic <- ksea_cutoff_statistic |
| 3406 cut_args$threshold <- ksea_cutoff_threshold | 6250 cut_args$threshold <- ksea_cutoff_threshold |
| 3407 number_of_peptides_found <- | 6251 number_of_peptides_found <- |
| 3408 draw_ppep_heatmap( | 6252 ppep_heatmap( |
| 3409 m = m, | 6253 m = m, |
| 6254 cellnote = cellnote_m, | |
| 3410 cutoff = cut_args, | 6255 cutoff = cut_args, |
| 3411 hm_heading_function = cat_enriched_heading, | 6256 hm_heading_function = cat_enriched_heading, |
| 3412 hm_main_title | 6257 hm_main_title |
| 3413 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | 6258 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", |
| 3414 suppress_row_dendrogram = FALSE | 6259 suppress_row_dendrogram = FALSE, |
| 6260 master_cex = 0.35, | |
| 6261 sepcolor = "black", | |
| 6262 colsep = sample_colsep | |
| 3415 ) | 6263 ) |
| 3416 if (number_of_peptides_found > 1) { | 6264 if (number_of_peptides_found > 1) { |
| 3417 cat("\\leavevmode\n") | 6265 |
| 3418 cat("The kinase-subsrate pair is shown in parentheses | 6266 tryCatch( |
| 3419 before the phosphopeptide sequence.\n\n") | 6267 { |
| 3420 cat("The adjusted ANOVA \\textit{p}-value is shown in parentheses | 6268 rownames(m) <- short_row_names |
| 3421 after the phosphopeptide sequence.\n\n") | 6269 cov_heatmap(m, nrow_m > g_intensity_hm_rows) |
| 6270 }, | |
| 6271 error = function(e) { | |
| 6272 cat( | |
| 6273 sprintf( | |
| 6274 "ERROR: %s\n\\newline\n", | |
| 6275 mget("e") | |
| 6276 ) | |
| 6277 ) | |
| 6278 cat( | |
| 6279 sprintf( | |
| 6280 "message: %s\n\\newline\n", | |
| 6281 e$message | |
| 6282 ) | |
| 6283 ) | |
| 6284 cat_margins() | |
| 6285 } | |
| 6286 ) | |
| 3422 } | 6287 } |
| 3423 if (nrow(m) == 1) { | 6288 substrates_df <- with( |
| 3424 cat( | 6289 enriched_substrates, |
| 3425 sprintf( | 6290 data.frame( |
| 3426 "\n\nSubstrate is %s, | 6291 substrate = sub(" ///*", "...", short_row_names), |
| 3427 \nphopshopeptide is %s, | 6292 sequence = trunc_long_ppep(ppep), |
| 3428 \n\nand adjusted ANOVA \\textit{p}-value is %0.2g.\n", | 6293 anova_p_value = signif(fdr_adjusted_anova_p, 2), |
| 3429 enriched_substrates[1, "sub_gene"], | 6294 min_group_obs_count = signif(min_group_obs_count, 0), |
| 3430 enriched_substrates[1, "ppep"], | 6295 quality = signif(quality, 3) |
| 3431 enriched_substrates[1, "fdr_adjusted_anova_p"] | 6296 ) |
| 6297 ) | |
| 6298 excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows | |
| 6299 if (excess_substrates) | |
| 6300 substrates_df <- substrates_df[1:g_intensity_hm_rows, ] | |
| 6301 names(substrates_df) <- headers_2nd_line <- | |
| 6302 c("Substrate", "Sequence", "p-value", "per group)", "quality") | |
| 6303 headers_1st_line <- c("", "", "ANOVA", "min(values", "") | |
| 6304 if (1 < nrow(enriched_substrates)) | |
| 6305 cat("\n\\newpage\n") | |
| 6306 cat(subsubsection_header( | |
| 6307 sprintf( | |
| 6308 "Details for %s%s-substrates", | |
| 6309 if (excess_substrates) | |
| 6310 sprintf( | |
| 6311 "%s \"highest quality\" ", | |
| 6312 g_intensity_hm_rows | |
| 3432 ) | 6313 ) |
| 6314 else "", | |
| 6315 kinase_name | |
| 3433 ) | 6316 ) |
| 3434 } | 6317 )) |
| 6318 substrates_df <- substrates_df[order(-substrates_df$quality), ] | |
| 6319 data_frame_tabbing_latex( | |
| 6320 x = substrates_df, | |
| 6321 tabstops = c(0.8, 3.8, 0.6, 0.8), | |
| 6322 headings = c(headers_1st_line, headers_2nd_line) | |
| 6323 ) | |
| 6324 } else { | |
| 6325 if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") | |
| 3435 } | 6326 } |
| 3436 } | 6327 if (print_nb_messages) nb("end kinase ", kinase_name, "\n") |
| 3437 | 6328 } |
| 3438 # Write output tabular files | 6329 |
| 3439 | 6330 # Write output tabular files |
| 3440 # get kinase, ppep, concat(kinase) tuples for enriched kinases | 6331 |
| 3441 | 6332 # get kinase, ppep, concat(kinase) tuples for enriched kinases |
| 3442 kinase_ppep_label <- sqldf(" | 6333 |
| 3443 WITH | 6334 if (print_nb_messages) nb("kinase_ppep_label <- ...\n") |
| 3444 t(ppep, label) AS | 6335 if (print_nb_messages) nbe("kinase_ppep_label <- ...\n") |
| 3445 ( | 6336 kinase_ppep_label <- sqldf(" |
| 3446 SELECT DISTINCT | 6337 WITH |
| 3447 SUB_MOD_RSD AS ppep, | 6338 t(ppep, label) AS |
| 3448 group_concat(gene, '; ') AS label | 6339 ( |
| 6340 SELECT DISTINCT | |
| 6341 SUB_MOD_RSD AS ppep, | |
| 6342 group_concat(gene, '; ') AS label | |
| 6343 FROM pseudo_ksdata | |
| 6344 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
| 6345 GROUP BY ppep | |
| 6346 ), | |
| 6347 k(kinase, ppep_join) AS | |
| 6348 ( | |
| 6349 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
| 3449 FROM pseudo_ksdata | 6350 FROM pseudo_ksdata |
| 3450 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6351 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
| 3451 GROUP BY ppep | 6352 ) |
| 3452 ), | 6353 SELECT k.kinase, t.ppep, t.label |
| 3453 k(kinase, ppep_join) AS | 6354 FROM t, k |
| 3454 ( | 6355 WHERE t.ppep = k.ppep_join |
| 3455 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | 6356 ORDER BY k.kinase, t.ppep |
| 3456 FROM pseudo_ksdata | 6357 ") |
| 3457 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6358 |
| 3458 ) | 6359 |
| 3459 SELECT k.kinase, t.ppep, t.label | 6360 # extract what we need from full_data |
| 3460 FROM t, k | 6361 impish <- cbind(rownames(quant_data_imp), quant_data_imp) |
| 3461 WHERE t.ppep = k.ppep_join | 6362 colnames(impish)[1] <- "Phosphopeptide" |
| 3462 ORDER BY k.kinase, t.ppep | 6363 data_table_imputed_sql <- " |
| 3463 ") | 6364 SELECT |
| 3464 | 6365 f.*, |
| 3465 # extract what we need from full_data | 6366 k.label AS KSEA_enrichments, |
| 3466 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | 6367 q.* |
| 3467 colnames(impish)[1] <- "Phosphopeptide" | 6368 FROM |
| 3468 data_table_imputed_sql <- " | 6369 metadata_plus_p f |
| 3469 SELECT | 6370 LEFT JOIN kinase_ppep_label k |
| 3470 f.*, | 6371 ON f.Phosphopeptide = k.ppep, |
| 3471 k.label AS KSEA_enrichments, | 6372 impish q |
| 3472 q.* | 6373 WHERE |
| 3473 FROM | 6374 f.Phosphopeptide = q.Phosphopeptide |
| 3474 metadata_plus_p f | 6375 " |
| 3475 LEFT JOIN kinase_ppep_label k | 6376 data_table_imputed <- sqldf(data_table_imputed_sql) |
| 3476 ON f.Phosphopeptide = k.ppep, | 6377 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
| 3477 impish q | 6378 data_table_imputed <- |
| 3478 WHERE | 6379 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
| 3479 f.Phosphopeptide = q.Phosphopeptide | 6380 |
| 3480 " | 6381 # Output imputed, un-normalized data |
| 3481 data_table_imputed <- sqldf(data_table_imputed_sql) | 6382 if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") |
| 3482 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6383 if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n") |
| 3483 data_table_imputed <- | 6384 write.table( |
| 3484 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6385 data_table_imputed |
| 3485 | 6386 , file = imputed_data_filename |
| 3486 # Output with imputed, un-normalized data | 6387 , sep = "\t" |
| 3487 | 6388 , col.names = TRUE |
| 3488 write.table( | 6389 , row.names = FALSE |
| 3489 data_table_imputed | 6390 , quote = FALSE |
| 3490 , file = imputed_data_filename | 6391 ) |
| 3491 , sep = "\t" | 6392 |
| 3492 , col.names = TRUE | 6393 |
| 3493 , row.names = FALSE | 6394 #output quantile normalized data |
| 3494 , quote = FALSE | 6395 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) |
| 3495 ) | 6396 colnames(impish)[1] <- "Phosphopeptide" |
| 3496 | 6397 data_table_imputed <- sqldf(data_table_imputed_sql) |
| 3497 | 6398 # Zap the duplicated 'Phosphopeptide' column named 'ppep' |
| 3498 #output quantile normalized data | 6399 data_table_imputed <- |
| 3499 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | 6400 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] |
| 3500 colnames(impish)[1] <- "Phosphopeptide" | 6401 if (print_nb_messages) nb("Output quantile normalized data tabular file\n") |
| 3501 data_table_imputed <- sqldf(data_table_imputed_sql) | 6402 if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") |
| 3502 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | 6403 write.table( |
| 3503 data_table_imputed <- | 6404 data_table_imputed, |
| 3504 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | 6405 file = imp_qn_lt_data_filenm, |
| 3505 write.table( | 6406 sep = "\t", |
| 3506 data_table_imputed, | 6407 col.names = TRUE, |
| 3507 file = imp_qn_lt_data_filenm, | 6408 row.names = FALSE, |
| 3508 sep = "\t", | 6409 quote = FALSE |
| 3509 col.names = TRUE, | 6410 ) |
| 3510 row.names = FALSE, | 6411 |
| 3511 quote = FALSE | 6412 ppep_kinase <- sqldf(" |
| 3512 ) | 6413 SELECT DISTINCT k.ppep, k.kinase |
| 3513 | 6414 FROM ( |
| 3514 ppep_kinase <- sqldf(" | 6415 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep |
| 3515 SELECT DISTINCT k.ppep, k.kinase | 6416 FROM pseudo_ksdata |
| 3516 FROM ( | 6417 WHERE GENE IN (SELECT kinase FROM enriched_kinases) |
| 3517 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | 6418 ) k |
| 3518 FROM pseudo_ksdata | 6419 ORDER BY k.ppep, k.kinase |
| 3519 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | 6420 ") |
| 3520 ) k | 6421 |
| 3521 ORDER BY k.ppep, k.kinase | 6422 RSQLite::dbWriteTable( |
| 3522 ") | 6423 conn = db, |
| 3523 | 6424 name = "ksea_enriched_ks", |
| 3524 RSQLite::dbWriteTable( | 6425 value = ppep_kinase, |
| 3525 conn = db, | 6426 append = FALSE |
| 3526 name = "ksea_enriched_ks", | 6427 ) |
| 3527 value = ppep_kinase, | 6428 } |
| 3528 append = FALSE | 6429 |
| 3529 ) | 6430 if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") |
| 3530 | 6431 |
| 3531 RSQLite::dbWriteTable( | 6432 RSQLite::dbWriteTable( |
| 3532 conn = db, | 6433 conn = db, |
| 3533 name = "anova_signif", | 6434 name = "anova_signif", |
| 3534 value = p_value_data, | 6435 value = p_value_data, |
| 3554 ON m.phospho_peptide = kek.ppep | 6455 ON m.phospho_peptide = kek.ppep |
| 3555 ; | 6456 ; |
| 3556 " | 6457 " |
| 3557 ) | 6458 ) |
| 3558 | 6459 |
| 6460 if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n") | |
| 6461 if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n") | |
| 3559 write.table( | 6462 write.table( |
| 3560 dbReadTable(db, "stats_metadata_v"), | 6463 dbReadTable(db, "stats_metadata_v"), |
| 3561 file = anova_ksea_mtdt_file, | 6464 file = anova_ksea_mtdt_file, |
| 3562 sep = "\t", | 6465 sep = "\t", |
| 3563 col.names = TRUE, | 6466 col.names = TRUE, |
| 3564 row.names = FALSE, | 6467 row.names = FALSE, |
| 3565 quote = FALSE | 6468 quote = FALSE |
| 3566 ) | 6469 ) |
| 3567 | 6470 |
| 6471 cat("\n\\clearpage\n") | |
| 3568 | 6472 |
| 3569 ``` | 6473 ``` |
| 6474 | |
| 6475 # Data-processing summary flowchart | |
| 6476 | |
| 6477  | |
| 3570 | 6478 |
| 3571 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 6479 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
| 3572 cat("\\leavevmode\n\n\n") | 6480 cat("\\leavevmode\n\n\n") |
| 3573 | 6481 |
| 3574 # write parameters to report | 6482 write_params(db) |
| 3575 | |
| 3576 param_unlist <- unlist(as.list(params)) | |
| 3577 param_df <- data.frame( | |
| 3578 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
| 3579 value = paste0( | |
| 3580 "\n\\begin{tiny}\n\\verb@", | |
| 3581 gsub("$", "\\$", param_unlist, fixed = TRUE), | |
| 3582 "@\n\\end{tiny}" | |
| 3583 ) | |
| 3584 ) | |
| 3585 | |
| 3586 data_frame_latex( | |
| 3587 x = param_df, | |
| 3588 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
| 3589 centered = TRUE, | |
| 3590 caption = "Input parameters", | |
| 3591 anchor = const_table_anchor_bp, | |
| 3592 underscore_whack = FALSE | |
| 3593 ) | |
| 3594 | |
| 3595 # write parameters to SQLite output | |
| 3596 | |
| 3597 mqppep_anova_script_param_df <- data.frame( | |
| 3598 script = "mqppep_anova_script.Rmd", | |
| 3599 parameter = names(param_unlist), | |
| 3600 value = param_unlist | |
| 3601 ) | |
| 3602 ddl_exec(db, " | |
| 3603 DROP TABLE IF EXISTS script_parameter; | |
| 3604 " | |
| 3605 ) | |
| 3606 ddl_exec(db, " | |
| 3607 CREATE TABLE IF NOT EXISTS script_parameter( | |
| 3608 script TEXT, | |
| 3609 parameter TEXT, | |
| 3610 value ANY, | |
| 3611 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
| 3612 ) | |
| 3613 ; | |
| 3614 " | |
| 3615 ) | |
| 3616 RSQLite::dbWriteTable( | |
| 3617 conn = db, | |
| 3618 name = "script_parameter", | |
| 3619 value = mqppep_anova_script_param_df, | |
| 3620 append = TRUE | |
| 3621 ) | |
| 3622 | |
| 3623 # We are done with output | 6483 # We are done with output |
| 3624 RSQLite::dbDisconnect(db) | 6484 RSQLite::dbDisconnect(db) |
| 6485 | |
| 6486 cat("\\clearpage\n\\section{R package versions}\n") | |
| 6487 utils::toLatex(utils::sessionInfo()) | |
| 3625 ``` | 6488 ``` |
| 3626 <!-- | |
| 3627 There's gotta be a better way... | |
| 3628 | |
| 3629 loaded_packages_df <- sessioninfo::package_info("loaded") | |
| 3630 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
| 3631 loaded_packages_df <- data.frame( | |
| 3632 package = loaded_packages_df$package, | |
| 3633 version = loaded_packages_df$loadedversion, | |
| 3634 date = loaded_packages_df$date | |
| 3635 ) | |
| 3636 data_frame_latex( | |
| 3637 x = loaded_packages_df, | |
| 3638 justification = "l | l l", | |
| 3639 centered = FALSE, | |
| 3640 caption = "Loaded R packages", | |
| 3641 anchor = const_table_anchor_bp | |
| 3642 ) | |
| 3643 --> |
