Mercurial > repos > eschen42 > mqppep_anova
comparison MaxQuantProcessingScript.R @ 23:3911581e639a draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit b18713d25e7b260d6cdaf99fb216a4d1b3014c47
| author | eschen42 |
|---|---|
| date | Mon, 11 Jul 2022 13:51:14 +0000 |
| parents | |
| children | 5b8e15b2a67c |
comparison
equal
deleted
inserted
replaced
| 22:61adb8801b73 | 23:3911581e639a |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 # This is the implementation for the | |
| 4 # "MaxQuant Phosphopeptide Localization Probability Cutoff" | |
| 5 # Galaxy tool (mqppep_lclztn_filter) | |
| 6 # It is adapted from the MaxQuant Processing Script written by Larry Cheng. | |
| 7 | |
| 8 # libraries | |
| 9 library(optparse) | |
| 10 library(data.table) | |
| 11 library(stringr) | |
| 12 library(ggplot2) | |
| 13 | |
| 14 # title: "MaxQuant Processing Script" | |
| 15 # author: "Larry Cheng" | |
| 16 # date: "February 19, 2018" | |
| 17 # | |
| 18 # # MaxQuant Processing Script | |
| 19 # Takes MaxQuant Phospho (STY)sites.txt file as input | |
| 20 # and performs the following (in order): | |
| 21 # 1) Runs the Proteomics Quality Control software | |
| 22 # 2) Remove contaminant and reverse sequence rows | |
| 23 # 3) Filters rows based on localization probability | |
| 24 # 4) Extract the quantitative data | |
| 25 # 5) Sequences phosphopeptides | |
| 26 # 6) Merges multiply phosphorylated peptides | |
| 27 # 7) Filters out phosphopeptides based on enrichment | |
| 28 # The output file contains the phosphopeptide (first column) | |
| 29 # and the quantitative values for each sample. | |
| 30 # | |
| 31 # ## Revision History | |
| 32 # Rev. 2022-02-10 :wrap for inclusion in Galaxy | |
| 33 # Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" | |
| 34 # and "Phosphopeptide Processing Script" | |
| 35 # Rev. 2017-12-12 :added PTXQC | |
| 36 # added additional plots and table outputs for quality control | |
| 37 # allowed for more than 2 samples to be grouped together | |
| 38 # (up to 26 (eg, 1A, 1B, 1C, etc)) | |
| 39 # converted from .r to .rmd file to knit report | |
| 40 # for quality control | |
| 41 # Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data | |
| 42 # impute multiple times | |
| 43 # Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows | |
| 44 # Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to | |
| 45 # preANOVA file output | |
| 46 # Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$" | |
| 47 # so that it looks at the end of string | |
| 48 # Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) | |
| 49 # Rev. 2016-07-03 :Removed row names from the write.table() output for | |
| 50 # ANOVA and PreANOVA | |
| 51 # Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 | |
| 52 # Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting | |
| 53 # the row numbers afterwards | |
| 54 # Rev. 2016-06-21 :test18 + standardized the regexpression in protocol | |
| 55 | |
| 56 | |
| 57 ### FUNCTION DECLARATIONS begin ---------------------------------------------- | |
| 58 | |
| 59 # Read first line of file at filePath | |
| 60 # adapted from: https://stackoverflow.com/a/35761217/15509512 | |
| 61 read_first_line <- function(filepath) { | |
| 62 con <- file(filepath, "r") | |
| 63 line <- readLines(con, n = 1) | |
| 64 close(con) | |
| 65 return(line) | |
| 66 } | |
| 67 | |
| 68 # Move columns to the end of dataframe | |
| 69 # - data: the dataframe | |
| 70 # - move: a vector of column names, each of which is an element of names(data) | |
| 71 movetolast <- function(data, move) { | |
| 72 data[c(setdiff(names(data), move), move)] | |
| 73 } | |
| 74 | |
| 75 # Generate phosphopeptide and build list when applied | |
| 76 phosphopeptide_func <- function(df) { | |
| 77 # generate peptide sequence and list of phosphopositions | |
| 78 phosphoprobsequence <- | |
| 79 strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] | |
| 80 output <- vector() | |
| 81 phosphopeptide <- "" | |
| 82 counter <- 0 # keep track of position in peptide | |
| 83 phosphopositions <- | |
| 84 vector() # keep track of phosphorylation positions in peptide | |
| 85 score_diff <- "" | |
| 86 for (chara in phosphoprobsequence) { | |
| 87 # build peptide sequence | |
| 88 if (!( | |
| 89 chara == " " || | |
| 90 chara == "(" || | |
| 91 chara == ")" || | |
| 92 chara == "." || | |
| 93 chara == "-" || | |
| 94 chara == "0" || | |
| 95 chara == "1" || | |
| 96 chara == "2" || | |
| 97 chara == "3" || | |
| 98 chara == "4" || | |
| 99 chara == "5" || | |
| 100 chara == "6" || | |
| 101 chara == "7" || | |
| 102 chara == "8" || | |
| 103 chara == "9") | |
| 104 ) { | |
| 105 phosphopeptide <- paste(phosphopeptide, chara, sep = "") | |
| 106 counter <- counter + 1 | |
| 107 } | |
| 108 # generate score_diff | |
| 109 if (chara == "-" || | |
| 110 chara == "." || | |
| 111 chara == "0" || | |
| 112 chara == "1" || | |
| 113 chara == "2" || | |
| 114 chara == "3" || | |
| 115 chara == "4" || | |
| 116 chara == "5" || | |
| 117 chara == "6" || | |
| 118 chara == "7" || | |
| 119 chara == "8" || | |
| 120 chara == "9" | |
| 121 ) { | |
| 122 score_diff <- paste(score_diff, chara, sep = "") | |
| 123 } | |
| 124 # evaluate score_diff | |
| 125 if (chara == ")") { | |
| 126 score_diff <- as.numeric(score_diff) | |
| 127 # only consider a phosphoresidue if score_diff > 0 | |
| 128 if (score_diff > 0) { | |
| 129 phosphopositions <- append(phosphopositions, counter) | |
| 130 } | |
| 131 score_diff <- "" | |
| 132 } | |
| 133 } | |
| 134 | |
| 135 # generate phosphopeptide sequence (ie, peptide sequence with "p"'s) | |
| 136 counter <- 1 | |
| 137 phosphoposition_correction1 <- | |
| 138 -1 # used to correct phosphosposition as "p"'s | |
| 139 # are inserted into the phosphopeptide string | |
| 140 phosphoposition_correction2 <- | |
| 141 0 # used to correct phosphosposition as "p"'s | |
| 142 # are inserted into the phosphopeptide string | |
| 143 while (counter <= length(phosphopositions)) { | |
| 144 phosphopeptide <- | |
| 145 paste( | |
| 146 substr( | |
| 147 phosphopeptide, | |
| 148 0, | |
| 149 phosphopositions[counter] + phosphoposition_correction1 | |
| 150 ), | |
| 151 "p", | |
| 152 substr( | |
| 153 phosphopeptide, | |
| 154 phosphopositions[counter] + phosphoposition_correction2, | |
| 155 nchar(phosphopeptide) | |
| 156 ), | |
| 157 sep = "" | |
| 158 ) | |
| 159 counter <- counter + 1 | |
| 160 phosphoposition_correction1 <- phosphoposition_correction1 + 1 | |
| 161 phosphoposition_correction2 <- phosphoposition_correction2 + 1 | |
| 162 } | |
| 163 # building phosphopeptide list | |
| 164 output <- append(output, phosphopeptide) | |
| 165 return(output) | |
| 166 } | |
| 167 | |
| 168 ### FUNCTION DECLARATIONS end ------------------------------------------------ | |
| 169 | |
| 170 | |
| 171 ### EXTRACT ARGUMENTS begin -------------------------------------------------- | |
| 172 | |
| 173 # parse options | |
| 174 option_list <- list( | |
| 175 make_option( | |
| 176 c("-i", "--input"), | |
| 177 action = "store", | |
| 178 type = "character", | |
| 179 help = "A MaxQuant Phospho (STY)Sites.txt" | |
| 180 ) | |
| 181 , | |
| 182 make_option( | |
| 183 c("-o", "--output"), | |
| 184 action = "store", | |
| 185 type = "character", | |
| 186 help = "path to output file" | |
| 187 ) | |
| 188 , | |
| 189 make_option( | |
| 190 c("-E", "--enrichGraph"), | |
| 191 action = "store", | |
| 192 type = "character", | |
| 193 help = "path to enrichment graph PDF" | |
| 194 ) | |
| 195 , | |
| 196 make_option( | |
| 197 c("-F", "--enrichGraph_svg"), | |
| 198 action = "store", | |
| 199 type = "character", | |
| 200 help = "path to enrichment graph SVG" | |
| 201 ) | |
| 202 , | |
| 203 make_option( | |
| 204 c("-L", "--locProbCutoffGraph"), | |
| 205 action = "store", | |
| 206 type = "character", | |
| 207 help = "path to location-proability cutoff graph PDF" | |
| 208 ) | |
| 209 , | |
| 210 make_option( | |
| 211 c("-M", "--locProbCutoffGraph_svg"), | |
| 212 action = "store", | |
| 213 type = "character", | |
| 214 help = "path to location-proability cutoff graph SVG" | |
| 215 ) | |
| 216 , | |
| 217 make_option( | |
| 218 c("-e", "--enriched"), | |
| 219 action = "store", | |
| 220 type = "character", | |
| 221 help = "pY or pST enriched samples (ie, 'Y' or 'ST')" | |
| 222 ) | |
| 223 # default = "^Number of Phospho [(]STY[)]$", | |
| 224 , | |
| 225 make_option( | |
| 226 c("-p", "--phosphoCol"), | |
| 227 action = "store", | |
| 228 type = "character", | |
| 229 help = paste0("PERL-compatible regular expression matching", | |
| 230 " header of column having number of 'Phospho (STY)'") | |
| 231 ) | |
| 232 # default = "^Intensity[^_]", | |
| 233 , | |
| 234 make_option( | |
| 235 c("-s", "--startCol"), | |
| 236 action = "store", | |
| 237 type = "character", | |
| 238 help = paste0("PERL-compatible regular expression matching", | |
| 239 " header of column having first sample intensity") | |
| 240 ) | |
| 241 # default = 1, | |
| 242 , | |
| 243 make_option( | |
| 244 c("-I", "--intervalCol"), | |
| 245 action = "store", | |
| 246 type = "integer", | |
| 247 help = paste0("Column interval between the Intensities of samples", | |
| 248 " (eg, 1 if subsequent column; 2 if every other column") | |
| 249 ) | |
| 250 # default = 0.75, | |
| 251 , | |
| 252 make_option( | |
| 253 c("-l", "--localProbCutoff"), | |
| 254 action = "store", | |
| 255 type = "double", | |
| 256 help = "Localization Probability Cutoff" | |
| 257 ) | |
| 258 # default = "sum", | |
| 259 , | |
| 260 make_option( | |
| 261 c("-f", "--collapse_func"), | |
| 262 action = "store", | |
| 263 type = "character", | |
| 264 help = paste0("merge identical phosphopeptides", | |
| 265 " by ('sum' or 'average') the intensities") | |
| 266 ) | |
| 267 # default = "filtered_data.txt", | |
| 268 , | |
| 269 make_option( | |
| 270 c("-r", "--filtered_data"), | |
| 271 action = "store", | |
| 272 type = "character", | |
| 273 help = "filtered_data.txt" | |
| 274 ) | |
| 275 # default = "quantData.txt", | |
| 276 , | |
| 277 make_option( | |
| 278 c("-q", "--quant_data"), | |
| 279 action = "store", | |
| 280 type = "character", | |
| 281 help = "quantData.txt" | |
| 282 ) | |
| 283 ) | |
| 284 args <- parse_args(OptionParser(option_list = option_list)) | |
| 285 # Check parameter values | |
| 286 | |
| 287 ### EXTRACT ARGUMENTS end ---------------------------------------------------- | |
| 288 | |
| 289 | |
| 290 ### EXTRACT PARAMETERS from arguments begin ---------------------------------- | |
| 291 | |
| 292 if (!file.exists(args$input)) { | |
| 293 stop((paste("File", args$input, "does not exist"))) | |
| 294 } | |
| 295 | |
| 296 phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$" | |
| 297 start_col_pattern <- "^Intensity[^_]" | |
| 298 phospho_col_pattern <- read_first_line(args$phosphoCol) | |
| 299 start_col_pattern <- read_first_line(args$startCol) | |
| 300 | |
| 301 sink(getConnection(2)) | |
| 302 | |
| 303 input_file_name <- args$input | |
| 304 filtered_filename <- args$filtered_data | |
| 305 quant_file_name <- args$quant_data | |
| 306 interval_col <- as.integer(args$intervalCol) | |
| 307 | |
| 308 first_line <- read_first_line(input_file_name) | |
| 309 col_headers <- | |
| 310 unlist(strsplit( | |
| 311 x = first_line, | |
| 312 split = c("\t"), | |
| 313 fixed = TRUE | |
| 314 )) | |
| 315 sink(getConnection(2)) | |
| 316 sink() | |
| 317 | |
| 318 | |
| 319 intensity_header_cols <- | |
| 320 grep(pattern = start_col_pattern, x = col_headers, perl = TRUE) | |
| 321 if (length(intensity_header_cols) == 0) { | |
| 322 err_msg <- | |
| 323 paste("Found no intensity columns matching pattern:", | |
| 324 start_col_pattern) | |
| 325 # Divert output to stderr | |
| 326 sink(getConnection(2)) | |
| 327 print(err_msg) | |
| 328 sink() | |
| 329 stop(err_msg) | |
| 330 } | |
| 331 | |
| 332 | |
| 333 phospho_col <- | |
| 334 grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1] | |
| 335 if (is.na(phospho_col)) { | |
| 336 err_msg <- | |
| 337 paste("Found no 'number of phospho sites' columns matching pattern:", | |
| 338 phospho_col_pattern) | |
| 339 # Divert output to stderr | |
| 340 sink(getConnection(2)) | |
| 341 print(err_msg) | |
| 342 sink() | |
| 343 stop(err_msg) | |
| 344 } | |
| 345 | |
| 346 | |
| 347 i_count <- 0 | |
| 348 this_column <- 1 | |
| 349 last_value <- intensity_header_cols[1] | |
| 350 intensity_cols <- c(last_value) | |
| 351 | |
| 352 while (length(intensity_header_cols) >= interval_col * i_count) { | |
| 353 i_count <- 1 + i_count | |
| 354 this_column <- interval_col + this_column | |
| 355 if (last_value + interval_col != intensity_header_cols[this_column]) | |
| 356 break | |
| 357 last_value <- intensity_header_cols[this_column] | |
| 358 if (length(intensity_header_cols) < interval_col * i_count) | |
| 359 break | |
| 360 intensity_cols <- | |
| 361 c(intensity_cols, intensity_header_cols[this_column]) | |
| 362 } | |
| 363 | |
| 364 start_col <- intensity_cols[1] | |
| 365 num_samples <- i_count | |
| 366 | |
| 367 output_filename <- args$output | |
| 368 enrich_graph_filename <- args$enrichGraph | |
| 369 loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph | |
| 370 enrich_graph_filename_svg <- args$enrichGraph_svg | |
| 371 loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg | |
| 372 | |
| 373 local_prob_cutoff <- args$localProbCutoff | |
| 374 enriched <- args$enriched | |
| 375 collapse_fn <- args$collapse_func | |
| 376 | |
| 377 ### EXTRACT PARAMETERS from arguments end ------------------------------------ | |
| 378 | |
| 379 | |
| 380 # Proteomics Quality Control for MaxQuant Results | |
| 381 # (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) | |
| 382 # is run by the Galaxy MaxQuant wrapper and need not be invoked here. | |
| 383 | |
| 384 | |
| 385 # Read & filter out contaminants, reverse sequences, & localization probability | |
| 386 # --- | |
| 387 full_data <- | |
| 388 read.table( | |
| 389 file = input_file_name, | |
| 390 sep = "\t", | |
| 391 header = TRUE, | |
| 392 quote = "" | |
| 393 ) | |
| 394 | |
| 395 # Filter out contaminant rows and reverse rows | |
| 396 filtered_data <- subset(full_data, !grepl("CON__", Proteins)) | |
| 397 filtered_data <- | |
| 398 subset(filtered_data, !grepl("_MYCOPLASMA", Proteins)) | |
| 399 filtered_data <- | |
| 400 subset(filtered_data, !grepl("CONTAMINANT_", Proteins)) | |
| 401 filtered_data <- | |
| 402 subset(filtered_data, !grepl("REV__", Protein) | |
| 403 ) # since REV__ rows are blank in the first column (Proteins) | |
| 404 write.table( | |
| 405 filtered_data, | |
| 406 file = filtered_filename, | |
| 407 sep = "\t", | |
| 408 quote = FALSE, | |
| 409 col.names = TRUE, | |
| 410 row.names = FALSE | |
| 411 ) | |
| 412 # ... | |
| 413 | |
| 414 | |
| 415 # Filter out data with localization probability below localProbCutoff | |
| 416 # --- | |
| 417 # Data filtered by localization probability | |
| 418 loc_prob_filtered_data <- | |
| 419 filtered_data[ | |
| 420 filtered_data$Localization.prob >= local_prob_cutoff, | |
| 421 ] | |
| 422 # ... | |
| 423 | |
| 424 | |
| 425 # Localization probability -- visualize locprob cutoff | |
| 426 # --- | |
| 427 loc_prob_graph_data <- | |
| 428 data.frame( | |
| 429 group = c(paste(">", toString(local_prob_cutoff), sep = ""), | |
| 430 paste("<", toString(local_prob_cutoff), sep = "")), | |
| 431 value = c( | |
| 432 nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100, | |
| 433 (nrow(filtered_data) - nrow(loc_prob_filtered_data)) | |
| 434 / nrow(filtered_data) * 100 | |
| 435 ) | |
| 436 ) | |
| 437 gigi <- | |
| 438 ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) + | |
| 439 geom_bar(width = 0.5, | |
| 440 stat = "identity", | |
| 441 color = "black") + | |
| 442 labs(x = NULL, | |
| 443 y = "percent", | |
| 444 title = "Phosphopeptides partitioned by localization-probability cutoff" | |
| 445 ) + | |
| 446 scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + | |
| 447 theme_minimal() + | |
| 448 theme( | |
| 449 legend.position = "right", | |
| 450 legend.title = element_text(), | |
| 451 plot.title = element_text(hjust = 0.5), | |
| 452 plot.subtitle = element_text(hjust = 0.5), | |
| 453 plot.title.position = "plot" | |
| 454 ) | |
| 455 pdf(loc_prob_cutoff_graph_filename) | |
| 456 print(gigi) | |
| 457 dev.off() | |
| 458 svg(loc_prob_cutoff_graph_fn_svg) | |
| 459 print(gigi) | |
| 460 dev.off() | |
| 461 # ... | |
| 462 | |
| 463 | |
| 464 # Extract quantitative values from filtered data | |
| 465 # --- | |
| 466 quant_data <- | |
| 467 loc_prob_filtered_data[, seq(from = start_col, | |
| 468 by = interval_col, | |
| 469 length.out = num_samples)] | |
| 470 # ... | |
| 471 | |
| 472 | |
| 473 # Generate Phosphopeptide Sequence | |
| 474 # for latest version of MaxQuant (Version 1.5.3.30) | |
| 475 # --- | |
| 476 metadata_df <- | |
| 477 data.frame( | |
| 478 loc_prob_filtered_data[, 1:8], | |
| 479 loc_prob_filtered_data[, phospho_col], | |
| 480 loc_prob_filtered_data[, phospho_col + 1], | |
| 481 loc_prob_filtered_data[, phospho_col + 2], | |
| 482 loc_prob_filtered_data[, phospho_col + 3], | |
| 483 loc_prob_filtered_data[, phospho_col + 4], | |
| 484 loc_prob_filtered_data[, phospho_col + 5], | |
| 485 loc_prob_filtered_data[, phospho_col + 6], | |
| 486 loc_prob_filtered_data[, phospho_col + 7], | |
| 487 quant_data | |
| 488 ) | |
| 489 colnames(metadata_df) <- | |
| 490 c( | |
| 491 "Proteins", | |
| 492 "Positions within proteins", | |
| 493 "Leading proteins", | |
| 494 "Protein", | |
| 495 "Protein names", | |
| 496 "Gene names", | |
| 497 "Fasta headers", | |
| 498 "Localization prob", | |
| 499 "Number of Phospho (STY)", | |
| 500 "Amino Acid", | |
| 501 "Sequence window", | |
| 502 "Modification window", | |
| 503 "Peptide window coverage", | |
| 504 "Phospho (STY) Probabilities", | |
| 505 "Phospho (STY) Score diffs", | |
| 506 "Position in peptide", | |
| 507 colnames(quant_data) | |
| 508 ) | |
| 509 # 'phosphopeptide_func' generates a phosphopeptide sequence | |
| 510 # for each row of data. | |
| 511 # for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both | |
| 512 metadata_df$phosphopeptide <- | |
| 513 apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func) | |
| 514 colnames(metadata_df)[1] <- "Phosphopeptide" | |
| 515 # Move the quant data columns to the right end of the data.frame | |
| 516 metadata_df <- movetolast(metadata_df, c(colnames(quant_data))) | |
| 517 # ... | |
| 518 | |
| 519 | |
| 520 # Write quantitative values for debugging purposes | |
| 521 # --- | |
| 522 quant_write <- cbind(metadata_df[, "Sequence window"], quant_data) | |
| 523 colnames(quant_write)[1] <- "Sequence.Window" | |
| 524 write.table( | |
| 525 quant_write, | |
| 526 file = quant_file_name, | |
| 527 sep = "\t", | |
| 528 quote = FALSE, | |
| 529 col.names = TRUE, | |
| 530 row.names = FALSE | |
| 531 ) | |
| 532 # ... | |
| 533 | |
| 534 | |
| 535 # Make new data frame containing only Phosphopeptides | |
| 536 # that are to be mapped to quant data (merge_df) | |
| 537 # --- | |
| 538 metadata_df <- | |
| 539 setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map | |
| 540 merge_df <- | |
| 541 data.frame( | |
| 542 as.integer(metadata_df$rn), | |
| 543 metadata_df$phosphopeptide # row index to merge data frames | |
| 544 ) | |
| 545 colnames(merge_df) <- c("rn", "Phosphopeptide") | |
| 546 # ... | |
| 547 | |
| 548 | |
| 549 # Add Phosphopeptide column to quant columns for quality control checking | |
| 550 # --- | |
| 551 quant_data_qc <- as.data.frame(quant_data) | |
| 552 setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data | |
| 553 quant_data_qc$rn <- as.integer(quant_data_qc$rn) | |
| 554 quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn") | |
| 555 quant_data_qc$rn <- NULL # remove rn column | |
| 556 # ... | |
| 557 | |
| 558 | |
| 559 # Collapse multiphosphorylated peptides | |
| 560 # --- | |
| 561 quant_data_qc_collapsed <- | |
| 562 data.table(quant_data_qc, key = "Phosphopeptide") | |
| 563 quant_data_qc_collapsed <- | |
| 564 aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn) | |
| 565 # ... | |
| 566 print("quant_data_qc_collapsed") | |
| 567 head(quant_data_qc_collapsed) | |
| 568 | |
| 569 # Compute (as string) % of phosphopeptides that are multiphosphorylated | |
| 570 # (for use in next step) | |
| 571 # --- | |
| 572 pct_multiphos <- | |
| 573 ( | |
| 574 nrow(quant_data_qc) - nrow(quant_data_qc_collapsed) | |
| 575 ) / (2 * nrow(quant_data_qc)) | |
| 576 pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") | |
| 577 # ... | |
| 578 | |
| 579 | |
| 580 # Compute and visualize breakdown of pY, pS, and pT before enrichment filter | |
| 581 # --- | |
| 582 py_data <- | |
| 583 quant_data_qc_collapsed[ | |
| 584 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), | |
| 585 ] | |
| 586 ps_data <- | |
| 587 quant_data_qc_collapsed[ | |
| 588 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"), | |
| 589 ] | |
| 590 pt_data <- | |
| 591 quant_data_qc_collapsed[ | |
| 592 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), | |
| 593 ] | |
| 594 | |
| 595 py_num <- nrow(py_data) | |
| 596 ps_num <- nrow(ps_data) | |
| 597 pt_num <- nrow(pt_data) | |
| 598 | |
| 599 # Visualize enrichment | |
| 600 enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"), | |
| 601 value = c(py_num, ps_num, pt_num)) | |
| 602 | |
| 603 enrich_graph_data <- | |
| 604 enrich_graph_data[ | |
| 605 enrich_graph_data$value > 0, | |
| 606 ] | |
| 607 | |
| 608 # Plot pie chart with legend | |
| 609 # start: https://stackoverflow.com/a/62522478/15509512 | |
| 610 # refine: https://www.statology.org/ggplot-pie-chart/ | |
| 611 # colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 | |
| 612 slices <- enrich_graph_data$value | |
| 613 phosphoresidue <- enrich_graph_data$group | |
| 614 pct <- round(100 * slices / sum(slices)) | |
| 615 lbls <- | |
| 616 paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "") | |
| 617 slc_ctr <- c() | |
| 618 run_tot <- 0 | |
| 619 for (p in pct) { | |
| 620 slc_ctr <- c(slc_ctr, run_tot + p / 2.0) | |
| 621 run_tot <- run_tot + p | |
| 622 } | |
| 623 lbl_y <- 100 - slc_ctr | |
| 624 df <- | |
| 625 data.frame(slices, | |
| 626 pct, | |
| 627 lbls, | |
| 628 phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) | |
| 629 gigi <- ggplot(df | |
| 630 , aes(x = 1, y = pct, fill = phosphoresidue)) + | |
| 631 geom_col(position = "stack", orientation = "x") + | |
| 632 geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + | |
| 633 coord_polar(theta = "y", direction = -1) + | |
| 634 labs( | |
| 635 x = NULL | |
| 636 , | |
| 637 y = NULL | |
| 638 , | |
| 639 title = "Percentages (and counts) of phosphosites, by type of residue" | |
| 640 , | |
| 641 caption = sprintf( | |
| 642 "Roughly %s of peptides have multiple phosphosites.", | |
| 643 pct_multiphos | |
| 644 ) | |
| 645 ) + | |
| 646 labs(x = NULL, y = NULL, fill = NULL) + | |
| 647 theme_classic() + | |
| 648 theme( | |
| 649 legend.position = "right" | |
| 650 , | |
| 651 axis.line = element_blank() | |
| 652 , | |
| 653 axis.text = element_blank() | |
| 654 , | |
| 655 axis.ticks = element_blank() | |
| 656 , | |
| 657 plot.title = element_text(hjust = 0.5) | |
| 658 , | |
| 659 plot.subtitle = element_text(hjust = 0.5) | |
| 660 , | |
| 661 plot.caption = element_text(hjust = 0.5) | |
| 662 , | |
| 663 plot.title.position = "plot" | |
| 664 ) + | |
| 665 scale_fill_manual(breaks = phosphoresidue, | |
| 666 values = c("#c7eae5", "#f6e8c3", "#dfc27d")) | |
| 667 | |
| 668 pdf(enrich_graph_filename) | |
| 669 print(gigi) | |
| 670 dev.off() | |
| 671 svg(enrich_graph_filename_svg) | |
| 672 print(gigi) | |
| 673 dev.off() | |
| 674 # ... | |
| 675 | |
| 676 | |
| 677 # Filter phosphopeptides by enrichment | |
| 678 # -- | |
| 679 if (enriched == "Y") { | |
| 680 quant_data_qc_enrichment <- quant_data_qc_collapsed[ | |
| 681 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), | |
| 682 ] | |
| 683 } else if (enriched == "ST") { | |
| 684 quant_data_qc_enrichment <- quant_data_qc_collapsed[ | |
| 685 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") | | |
| 686 str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), | |
| 687 ] | |
| 688 } else { | |
| 689 print("Error in enriched variable. Set to either 'Y' or 'ST'") | |
| 690 } | |
| 691 # ... | |
| 692 | |
| 693 print("quant_data_qc_enrichment") | |
| 694 head(quant_data_qc_enrichment) | |
| 695 | |
| 696 # Write phosphopeptides filtered by enrichment | |
| 697 # -- | |
| 698 write.table( | |
| 699 quant_data_qc_enrichment, | |
| 700 file = output_filename, | |
| 701 sep = "\t", | |
| 702 quote = FALSE, | |
| 703 row.names = FALSE | |
| 704 ) | |
| 705 # ... |
