comparison PMD_FDR_package_for_Galaxy.R @ 0:6dfa920caa52 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pdm_fdr commit 53f3dc4ef4742606e24ab2b0e899b2dc56acac83
author galaxyp
date Fri, 27 Sep 2019 14:36:10 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6dfa920caa52
1 ###############################################################################
2 # PMD_FDR_package_for_Galaxy.R #
3 # #
4 # Project 021 - PMD-FDR for Galaxy-P #
5 # #
6 # Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy #
7 # Note that plotting code has been left in that is not used #
8 # in this file; this is the code I used to create figures for #
9 # publication. I left it in for potential development of views. #
10 # #
11 # This file was created by concatenating the following files: #
12 # #
13 # A - 005 - Parser - ArgParser.R #
14 # B - 019 - PMD-FDR - functions.R #
15 # C - 021 - PMD-FDR Wrapper - functions.R #
16 # D - 021 - PMD-FDR Main.R #
17 # #
18 # Required packages: argparser #
19 # stringr #
20 # #
21 # Release date: 2019-09-26 #
22 # Version: 1.2 #
23 # #
24 ###############################################################################
25 # Package currently supports the following parameters:
26 #
27 # --psm_report full name and path to the PSM report
28 # --psm_report_1_percent full name and path to the PSM report for 1% FDR
29 # --output_i_fdr full name and path to the i-FDR output file
30 # --output_g_fdr full name and path to the g-FDR output file
31 # --output_densities full name and path to the densities output file
32 # --
33 #
34 ###############################################################################
35 # A - 005 - Parser - ArgParser.R #
36 # #
37 # Description: Wrapper for argparser package, using RefClass #
38 # #
39 ###############################################################################
40
41 #install.packages("argparser")
42 library(argparser)
43
44 # Class definition
45
46 ArgParser <- setRefClass("ArgParser",
47 fields = c("parser"))
48 ArgParser$methods(
49 initialize = function(...){
50 parser <<- arg_parser(...)
51 },
52 local_add_argument = function(...){
53 parser <<- add_argument(parser, ...)
54 },
55 parse_arguments = function(...){
56 result = parse_args(parser, ...)
57 return(result)
58 }
59 )
60
61 ###############################################################################
62 # B - 019 - PMD-FDR - functions.R #
63 # #
64 # Primary work-horse for PMD-FDR #
65 # #
66 ###############################################################################
67 ###############################################################################
68 ####### Load libraries etc.
69 ###############################################################################
70 library(stringr)
71 library(RUnit)
72
73 #############################################################
74 ####### Global values (should be parameters to module)
75 #############################################################
76
77 MIN_GOOD_PEPTIDE_LENGTH <- 11
78
79 #############################################################
80 ####### General purpose functions
81 #############################################################
82 # Creates a more useful error report when file is not reasonable
83 safe_file_exists <- function(file_path){ # Still not particularly useful in cases where it is a valid directory
84 tryCatch(
85 return(file_test(op = "-f", x=file_path)),
86 error=function(e) {simpleError(sprintf("file path is not valid: '%s'", file_path))}
87 )
88 }
89 # My standard way of loading data into data.frames
90 load_standard_df <- function(file_path=NULL){
91 clean_field_names = function(field_names){
92 result <- field_names
93 idx_blank <- which(result == "")
94 result[idx_blank] <- sprintf("<Field %d>", idx_blank)
95 return(result)
96 }
97 if (safe_file_exists(file_path)){
98 field_names <- read_field_names(file_path, sep = "\t")
99 field_names <- clean_field_names(field_names)
100 data <- read.table(file = file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, blank.lines.skip = TRUE)#, check.names = FALSE)
101 colnames(data) = field_names
102 } else {
103 stop(sprintf("File path does not exist: '%s'", file_path))
104 }
105 return(data)
106 }
107 save_standard_df <- function(x=NULL, file_path=NULL){
108 if (file_path != ""){
109 write.table(x = x, file = file_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
110 }
111 }
112 rename_column <- function(df=NULL, name_before=NULL, name_after=NULL, suppressWarnings=FALSE){
113 if (is.null(df)){
114 stop("Dataframe (df) does not exist - unable to rename column")
115 }
116 if (name_before %in% colnames(df)){
117 df[,name_after] <- df[,name_before]
118 df[,name_before] <- NULL
119 } else if (!suppressWarnings){
120 warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before))
121 }
122 return(df)
123 }
124 rename_columns <- function(df=NULL, names_before=NULL, names_after=NULL){
125 for (i in safe_iterator(length(names_before))){
126 df <- rename_column(df, names_before[i], names_after[i])
127 }
128 return(df)
129 }
130 round_to_tolerance <- function(x=NULL, tolerance=NULL, ...){
131 return(function_to_tolerance(x=x, tolerance=tolerance, FUN=round, ...))
132 }
133 function_to_tolerance <- function(x=NULL, tolerance=NULL, FUN=NULL, ...){
134 return(FUN(x/tolerance, ...) * tolerance)
135 }
136 safe_median <- function(x) median(x, na.rm=TRUE)
137 normalize_density <- function(d){
138 # Normalizes y-values in density function
139 # so that the integral under the curve is 1
140 # (uses rectangles to approximate area)
141 delta_x <- diff(range(d$x)) / length(d$x)
142 unnormalized_integral <- delta_x * sum(d$y)
143 new_d <- d
144 new_d$y <- with(new_d, y )
145
146 return(new_d)
147 }
148 if_null <- function(cond=NULL, null_result=NULL, not_null_result=NULL){
149 return(switch(1+is.null(cond),
150 not_null_result,
151 null_result))
152 }
153 rainbow_with_fixed_intensity <- function(n=NULL, goal_intensity_0_1=NULL, alpha=NULL){
154 goal_intensity <- 255*goal_intensity_0_1
155 hex_colors <- rainbow(n)
156 rgb_colors <- col2rgb(hex_colors)
157 df_colors <- data.frame(t(rgb_colors))
158 df_colors$intensity <- with(df_colors, 0.2989*red + 0.5870*green + 0.1140*blue)
159
160 df_colors$white_black <- with(df_colors, ifelse(intensity < goal_intensity, 255, 0))
161 df_colors$mix_level <- with(df_colors, (white_black - goal_intensity) / (white_black - intensity ) )
162 df_colors$new_red <- with(df_colors, mix_level*red + (1-mix_level)*white_black)
163 df_colors$new_green <- with(df_colors, mix_level*green + (1-mix_level)*white_black)
164 df_colors$new_blue <- with(df_colors, mix_level*blue + (1-mix_level)*white_black)
165 names_pref_new <- c("new_red", "new_green", "new_blue")
166 names_no_pref <- c("red", "green", "blue")
167 df_colors <- df_colors[,names_pref_new]
168 df_colors <- rename_columns(df_colors, names_before = names_pref_new, names_after = names_no_pref)
169 rgb_colors <-as.matrix(df_colors/255 )
170
171 return(rgb(rgb_colors, alpha=alpha))
172 }
173 safe_iterator <- function(n_steps = NULL){
174 if (n_steps < 1){
175 result = numeric(0)
176 } else {
177 result = 1:n_steps
178 }
179 return(result)
180 }
181 col2hex <- function(cols=NULL, col_alpha=255){
182 if (all(col_alpha<=1)){
183 col_alpha <- round(col_alpha*255)
184 }
185 col_matrix <- t(col2rgb(cols))
186 results <- rgb(col_matrix, alpha=col_alpha, maxColorValue = 255)
187 return(results)
188 }
189 credible_interval <- function(x=NULL, N=NULL, precision=0.001, alpha=0.05){
190 # Approximates "highest posterior density interval"
191 # Uses exact binomial but with a finite list of potential values (1/precision)
192
193 p <- seq(from=0, to=1, by=precision)
194 d <- dbinom(x = x, size = N, prob = p)
195 d <- d / sum(d)
196 df <- data.frame(p=p, d=d)
197 df <- df[order(-df$d),]
198 df$cumsum <- cumsum(df$d)
199 max_idx <- sum(df$cumsum < (1-alpha)) + 1
200 max_idx <- min(max_idx, nrow(df))
201
202 lower <- min(df$p[1:max_idx])
203 upper <- max(df$p[1:max_idx])
204
205 return(c(lower,upper))
206 }
207 verified_element_of_list <- function(parent_list=NULL, element_name=NULL, object_name=NULL){
208 if (is.null(parent_list[[element_name]])){
209 if (is.null(object_name)){
210 object_name = "the list"
211 }
212 stop(sprintf("Element '%s' does not yet exist in %s", element_name, object_name))
213 }
214 return(parent_list[[element_name]])
215 }
216 read_field_names = function(file_path=NULL, sep = "\t"){
217 con = file(file_path,"r")
218 fields = readLines(con, n=1)
219 close(con)
220 fields = strsplit(x = fields, split = sep)[[1]]
221 return(fields)
222 }
223 check_field_name = function(input_df = NULL, name_of_input_df=NULL, field_name=NULL){
224 test_succeeded <- field_name %in% colnames(input_df)
225 current_columns <- paste0(colnames(input_df), collapse=", ")
226 checkTrue(test_succeeded,
227 msg = sprintf("Expected fieldname '%s' in %s (but did not find it among %s)",
228 field_name, name_of_input_df, current_columns))
229 }
230
231 #############################################################
232 ####### Classes for Data
233 #############################################################
234
235 ###############################################################################
236 # Class: Data_Object
237 ###############################################################################
238 Data_Object <- setRefClass("Data_Object",
239 fields =list(m_is_dirty = "logical",
240 parents = "list",
241 children = "list",
242 class_name = "character"))
243 Data_Object$methods(
244 initialize = function(){
245 m_is_dirty <<- TRUE
246 class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>"
247 },
248 load_data = function(){
249 #print(sprintf("Calling %s$load_data()", class_name)) # Useful for debugging
250 ensure_parents()
251 verify()
252 m_load_data()
253 set_dirty(new_value = FALSE)
254 },
255 ensure = function(){
256 if (m_is_dirty){
257 load_data()
258 }
259 },
260 set_dirty = function(new_value){
261 if (new_value != m_is_dirty){
262 m_is_dirty <<- new_value
263 set_children_dirty()
264 }
265 },
266 verify = function(){
267 stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name))
268 },
269 m_load_data = function(){
270 stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name))
271 },
272 append_parent = function(parent=NULL){
273 parents <<- append(parents, parent)
274 },
275 append_child = function(child=NULL){
276 children <<- append(children, child)
277 },
278 ensure_parents = function(){
279 for (parent in parents){
280 # print(sprintf("Calling %s$ensure()", parent$class_name)) # Useful for debugging
281 parent$ensure()
282 }
283 },
284 set_children_dirty = function(){
285 for (child in children){
286 child$set_dirty(TRUE)
287 }
288 }
289 )
290 ###############################################################################
291 # Class: Data_Object_Info
292 ###############################################################################
293 Data_Object_Info <- setRefClass("Data_Object_Info",
294 contains = "Data_Object",
295 fields =list(
296 data_file_name_1_percent_FDR = "character",
297 data_file_name = "character",
298 data_path_name = "character",
299 experiment_name = "character",
300 designation = "character",
301
302 m_input_file_type = "character",
303
304 score_field_name = "character"
305 #collection_name="character",
306 #dir_results="character",
307 #dir_dataset="character",
308 #dataset_designation="character",
309 #file_name_dataset="character",
310 #file_name_dataset_1_percent="character",
311 #experiment_name="character"
312 ) )
313 Data_Object_Info$methods(
314 initialize = function(){
315 callSuper()
316 class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>"
317 },
318 verify = function(){
319 checkFieldExists = function(field_name=NULL){
320 field_value <- .self[[field_name]]
321 checkTrue(length(field_value) > 0,
322 sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name))
323 checkTrue(length(field_value) == 1,
324 sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name))
325 checkTrue(field_value != "",
326 sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name))
327 }
328 checkFieldExists("data_file_name")
329 checkFieldExists("data_path_name")
330 checkFieldExists("experiment_name")
331 checkFieldExists("designation")
332 checkFieldExists("m_input_file_type")
333 checkFieldExists("score_field_name")
334 },
335 m_load_data = function(){
336 # Nothing to do - this is really a data class
337 },
338 file_path = function(){
339 result <- file.path(data_path_name, data_file_name)
340 if (length(result) == 0){
341 stop("Unable to validate file path - one or both of path name and file name are missing")
342 }
343 return(result)
344 },
345 file_path_1_percent_FDR = function(){
346 local_file_name <- get_data_file_name_1_percent_FDR()
347 if (length(local_file_name) == 0){
348 result <- ""
349 } else {
350 result <- file.path(data_path_name, local_file_name)
351 }
352
353 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream
354
355 # if (length(result) == 0){
356 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing")
357 # }
358 return(result)
359 },
360 get_data_file_name_1_percent_FDR = function(){
361 return(data_file_name_1_percent_FDR)
362 },
363 collection_name = function(){
364 result <- sprintf("%s_%s", experiment_name, designation)
365 return(result)
366 },
367 get_field_name_of_score = function(){
368 if (length(score_field_name) == 0){
369 stop("score_field_name has not been set for this project")
370 }
371 return(score_field_name)
372 },
373 set_input_file_type = function(p_input_file_type=NULL){
374 if (p_input_file_type == "PSM_Report"){
375 # do nothing, for now
376 }
377 else if (p_input_file_type == "PMD_FDR_input_file"){
378 score_field_name <<- "PMD_FDR_input_score"
379 }
380 else {
381 stop(sprintf("input_file_type ('%s') is not currently supported - file_type not changed", p_input_file_type))
382 }
383 m_input_file_type <<- p_input_file_type
384 },
385 get_input_file_type = function(){
386 if (length(m_input_file_type) == 0){
387 stop("input_file_type has not been set yet (null string)")
388 }
389 if (m_input_file_type == ""){
390 stop("input_file_type has not been set yet")
391 }
392
393 return(m_input_file_type)
394 }
395 )
396 ###############################################################################
397 # Class: Data_Object_Info_737_two_step
398 ###############################################################################
399 Data_Object_Info_737_two_step <- setRefClass("Data_Object_Info_737_two_step",
400 contains = "Data_Object_Info",
401 fields =list())
402 Data_Object_Info_737_two_step$methods(
403 initialize = function(){
404 callSuper()
405 class_name <<- "Data_Object_Info_737_two_step"
406 score_field_name <<- "Confidence [%]"
407 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
408 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular"
409 data_path_name <<- file.path(".", "Data")
410 experiment_name <<- "Oral_737_NS"
411 designation <<- "two_step"
412
413 set_input_file_type("PSM_Report")
414
415 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
416 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
417
418 }
419 )
420
421 ###############################################################################
422 # Class: Data_Object_Info_737_combined
423 ###############################################################################
424 Data_Object_Info_737_combined <- setRefClass("Data_Object_Info_737_combined",
425 contains = "Data_Object_Info",
426 fields =list())
427 Data_Object_Info_737_combined$methods(
428 initialize = function(){
429 callSuper()
430 class_name <<- "Data_Object_Info_737_combined"
431 score_field_name <<- "Confidence [%]"
432 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
433 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular"
434 data_path_name <<- file.path(".", "Data")
435 experiment_name <<- "Oral_737_NS"
436 designation <<- "two_step"
437
438 set_input_file_type("PSM_Report")
439
440 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
441 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
442
443 }
444 )
445
446 ###############################################################################
447 # Class: Data_Object_Pyrococcus_tr
448 ###############################################################################
449 Data_Object_Pyrococcus_tr <- setRefClass("Data_Object_Pyrococcus_tr",
450 contains = "Data_Object_Info",
451 fields =list())
452 Data_Object_Pyrococcus_tr$methods(
453 initialize = function(){
454 callSuper()
455 class_name <<- "Data_Object_Pyrococcus_tr"
456 score_field_name <<- "Confidence [%]"
457 data_file_name_1_percent_FDR <<- ""
458 data_file_name <<- "Pfu_traditional_Extended_PSM_Report.tabular"
459 data_path_name <<- file.path(".", "Data")
460 experiment_name <<- "Pyrococcus"
461 designation <<- "tr"
462
463 set_input_file_type("PSM_Report")
464
465 }
466 )
467 ###############################################################################
468 # Class: Data_Object_Mouse_Mutations
469 ###############################################################################
470 Data_Object_Mouse_Mutations <- setRefClass("Data_Object_Mouse_Mutations",
471 contains = "Data_Object_Info",
472 fields =list())
473 Data_Object_Mouse_Mutations$methods(
474 initialize = function(){
475 callSuper()
476 class_name <<- "Data_Object_Mouse_Mutations"
477 score_field_name <<- "Confidence [%]"
478 data_file_name_1_percent_FDR <<- ""
479 data_file_name <<- "Combined_DB_Mouse_5PTM.tabular"
480 data_path_name <<- file.path(".", "Data")
481 experiment_name <<- "Mouse Mutations"
482 designation <<- "combined_05"
483
484 set_input_file_type("PSM_Report")
485
486 }
487 )
488 ###############################################################################
489 # Class: Data_Object_Raw_Data
490 ###############################################################################
491 Data_Object_Raw_Data <- setRefClass("Data_Object_Raw_Data",
492 contains = "Data_Object",
493 fields =list(df = "data.frame"))
494 Data_Object_Raw_Data$methods(
495 initialize = function(){
496 callSuper()
497 class_name <<- "Data_Object_Raw_Data"
498 },
499 verify = function(){
500 # Check that file exists before using it
501 file_path <- get_info()$file_path()
502 if (! safe_file_exists(file_path)){
503 stop(sprintf("Raw data file does not exist (%s)", file_path))
504 }
505 },
506 set_info = function(info){
507 parents[["info"]] <<- info
508 },
509 get_info = function(){
510 return(verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents"))
511 },
512 m_load_data = function(){
513 info <- get_info()
514 df <<- load_standard_df(info$file_path())
515 }
516 )
517 ###############################################################################
518 # Class: Data_Object_Raw_1_Percent
519 ###############################################################################
520 Data_Object_Raw_1_Percent <- setRefClass("Data_Object_Raw_1_Percent",
521 contains = "Data_Object",
522 fields =list(df = "data.frame"))
523 Data_Object_Raw_1_Percent$methods(
524 initialize = function(){
525 callSuper()
526 class_name <<- "Data_Object_Raw_1_Percent"
527 },
528 set_info = function(info){
529 parents[["info"]] <<- info
530 },
531 verify = function(){
532 # Do nothing - a missing file name is acceptable for this module and is dealt with in load()
533 },
534 get_info = function(){
535 return(verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents"))
536 },
537 m_load_data = function(){
538
539 info <- get_info()
540 file_path <- info$file_path_1_percent_FDR()
541 if (exists()){
542 df <<- load_standard_df(info$file_path_1_percent_FDR())
543 } # Note that failing to load is a valid state for this file, leading to not is_dirty. BUGBUG: this could lead to problems if a good file appears later
544 },
545 exists = function(){
546
547 info <- get_info()
548 local_file_name <- info$get_data_file_name_1_percent_FDR() # Check file name not file path
549
550 if (length(local_file_name) == 0 ){ # variable not set
551 result = FALSE
552 } else if (local_file_name == ""){ # variable set to empty string
553 result = FALSE
554 } else {
555 result = safe_file_exists(info$file_path_1_percent_FDR())
556 }
557
558 return(result)
559 }
560 )
561 ###############################################################################
562 # Class: Data_Object_Data_Converter
563 ###############################################################################
564 Data_Object_Data_Converter <- setRefClass("Data_Object_Data_Converter",
565 contains = "Data_Object",
566 fields =list(df = "data.frame"))
567 Data_Object_Data_Converter$methods(
568 initialize = function(){
569 callSuper()
570 class_name <<- "Data_Object_Data_Converter"
571 },
572 currently_supported_file_types = function(){
573 return(c("PSM_Report", "PMD_FDR_input_file"))
574 },
575 verify = function(){
576 checkFileTypeSupported = function(){
577 file_type <- get_info()$get_input_file_type()
578
579 supported_list <- paste0(currently_supported_file_types(), collapse = " ")
580 checkTrue(file_type %in% currently_supported_file_types(),
581 sprintf("File type (%s) is not currently supported in Data_Object_Data_Converter (currently support: %s)",
582 file_type, supported_list))
583 }
584 info <- get_info()
585 raw_data <- get_raw_data()
586
587 file_type <- info$get_input_file_type()
588
589 checkFileTypeSupported()
590
591 data_original <- raw_data$df
592 if (file_type == "PSM_Report"){
593 check_field_name(data_original, "raw_data", info$get_field_name_of_score())
594 check_field_name(data_original, "raw_data", "Precursor m/z Error [ppm]")
595 check_field_name(data_original, "raw_data", "Spectrum File")
596 check_field_name(data_original, "raw_data", "Protein(s)")
597 check_field_name(data_original, "raw_data", "Spectrum Title")
598 check_field_name(data_original, "raw_data", "Decoy")
599 check_field_name(data_original, "raw_data", "Sequence")
600 }
601 if (file_type == "PMD_FDR_input_file"){
602 check_field_name(data_original, "raw_data", "PMD_FDR_input_score")
603 check_field_name(data_original, "raw_data", "PMD_FDR_pmd")
604 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file")
605 check_field_name(data_original, "raw_data", "PMD_FDR_proteins")
606 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title")
607 check_field_name(data_original, "raw_data", "PMD_FDR_sequence")
608 check_field_name(data_original, "raw_data", "PMD_FDR_decoy")
609 }
610
611 # if (file_type == "PSM_Report"){
612 # data_new <- raw_data$df
613 # field_name_of_score <- info$get_field_name_of_score()
614 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ]
615 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"]
616 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ]
617 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ]
618 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ]
619 # }
620
621 },
622 m_load_data = function(){
623
624 info <- get_info()
625 raw_data <- get_raw_data()
626
627 file_type <- info$get_input_file_type()
628
629 if (file_type == "PSM_Report"){
630 data_new <- raw_data$df
631 field_name_of_score <- info$get_field_name_of_score()
632 data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ]
633 data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"]
634 data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ]
635 data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ]
636 data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ]
637 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ]
638 data_new$PMD_FDR_decoy <- data_new[, "Decoy" ]
639
640 df <<- data_new
641 }
642 else if (file_type == "PMD_FDR_input_file"){
643 df <<- raw_data$df
644 }
645 else {
646 stop(sprintf("Input file type is not currently supported: '%s'", file_type))
647 }
648
649 },
650 set_info = function(info){
651 parents[["info"]] <<- info
652 },
653 get_info = function(){
654 return(verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents"))
655 },
656 set_raw_data = function(raw_data){
657 parents[["raw_data"]] <<- raw_data
658 },
659 get_raw_data = function(){
660 return(verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents"))
661 }
662 )
663 ###############################################################################
664 # Class: Data_Object_Groupings
665 ###############################################################################
666 Data_Object_Groupings <- setRefClass("Data_Object_Groupings",
667 contains = "Data_Object",
668 fields =list(df = "data.frame"))
669 Data_Object_Groupings$methods(
670 initialize = function(){
671 callSuper()
672 class_name <<- "Data_Object_Groupings"
673 },
674 simplify_field_name = function(x=NULL){
675 result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x)
676 return(result)
677 },
678 verify = function(){
679 data_original <- get_data_converter()$df
680
681 check_field_name(data_original, "data_converter", "PMD_FDR_input_score")
682 check_field_name(data_original, "data_converter", "PMD_FDR_pmd")
683 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file")
684 check_field_name(data_original, "data_converter", "PMD_FDR_proteins")
685 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title")
686 check_field_name(data_original, "data_converter", "PMD_FDR_sequence")
687 check_field_name(data_original, "data_converter", "PMD_FDR_decoy")
688
689 },
690 m_load_data = function(){
691 make_data_groups <- function(data_original=NULL){
692
693 # Functions supporting make_data_groups()
694
695 standardize_fields <- function(data=NULL){
696 data_new <- data
697
698 info <- get_info()
699 info$ensure()
700 field_name_of_score <- info$get_field_name_of_score()
701
702 # #data_new <- rename_column(data_new, "Variable Modifications" , "ptm_list")
703 # data_new <- rename_column(data_new, field_name_of_score , "PMD_FDR_input_score")
704 # data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd")
705 # #data_new <- rename_column(data_new, "Isotope Number" , "isotope_number")
706 # #data_new <- rename_column(data_new, "m/z" , "m_z")
707 # #data_new <- rename_column(data_new, "Measured Charge" , "charge")
708 # data_new <- rename_column(data_new, "Spectrum File" , "PMD_FDR_spectrum_file")
709 # data_new <- rename_column(data_new, "Protein(s)" , "PMD_FDR_proteins")
710 # data_new <- rename_column(data_new, "Spectrum Title" , "PMD_FDR_spectrum_title")
711 # data_new <- manage_decoy_column(data_new)
712
713 # Now managed in Data_Converter
714 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ]
715 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"]
716 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ]
717 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ]
718 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ]
719
720 data_new$value <- data_new$PMD_FDR_pmd
721 data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence)
722 #data_new$charge_value <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1)))
723 #data_new$measured_mass <- with(data_new, m_z*charge_value)
724 data_new$PMD_FDR_spectrum_index <- NA
725 data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new)
726
727 return(data_new)
728 }
729 add_grouped_variable <- function(data_groups = data_groups, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){
730
731 # Support functions for add_grouped_variable()
732 find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){
733 q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE)
734 q <- round_to_tolerance(q, tolerance = tolerance)
735 return(q)
736 }
737 get_group_data_frame <- function(vec=NULL, value_format = NULL){
738 n <- length(vec)
739 a <- vec[-n]
740 b <- vec[-1]
741
742 lower <- ifelse(a == b , "eq", NA)
743 lower <- ifelse(is.na(lower ), "ge", lower)
744 upper <- ifelse(a == b , "eq", NA)
745 upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq")
746 upper <- ifelse(is.na(upper ), "lt", upper)
747 group <- data.frame(list(idx=1:(n-1), a=a, b=b, lower=lower, upper=upper))
748
749 name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format)
750 group$new_var <- with(group, sprintf(name_format, a, b, lower, upper))
751
752 return(group)
753 }
754 merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){
755 field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group))
756 group_idx <- findInterval(x = data_groups[,field_name_to_group],
757 vec = vec,
758 all.inside=TRUE)
759 data_groups$new_var <- group$new_var[group_idx]
760 data_groups <- rename_column(data_groups, "new_var", field_name_new)
761 }
762 # Body of add_grouped_variable()
763
764 vec <- find_interval_vec(x = data_groups[[field_name_to_group]],
765 length.out = vec.length.out,
766 tolerance = vec.tolerance )
767 group <- get_group_data_frame(vec = vec,
768 value_format = value_format)
769 df_new <- merge_group_with_data(data_groups = data_groups,
770 group = group,
771 vec = vec,
772 field_name_to_group = field_name_to_group)
773 df_new <- add_group_decoy(df_new, field_name_to_group)
774
775 return(df_new)
776 }
777 add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){
778 old_name <- field_name_to_group
779 new_name <- sprintf("group_%s", simplify_field_name(old_name))
780 df_new <- data_groups
781 df_new[[new_name]] <- data_groups[[old_name]]
782
783 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group)
784
785 return(df_new)
786 }
787 add_value_norm <- function(data_groups = NULL){
788
789 df_new <- data_groups
790 df_new$value_norm <- with(df_new, value - median_of_group_index)
791
792 return(df_new)
793 }
794 add_protein_group <-function(data_groups = NULL){
795 data_new <- data_groups
796 df_group_def <- data.frame(stringsAsFactors = FALSE,
797 list(pattern = c("" , "pfu_" , "cRAP"),
798 group_name = c("human", "pyrococcus", "contaminant")))
799 for (i in 1:nrow(df_group_def)){
800 idx <- grepl(pattern = df_group_def$pattern[i],
801 x = data_new$PMD_FDR_proteins)
802 data_new$group_proteins[idx] <- df_group_def$group_name[i]
803 }
804
805 data_new <- add_group_decoy(data_groups = data_new, field_name_to_group = "PMD_FDR_proteins")
806 return(data_new)
807 }
808 add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){
809 simple_field_name <- simplify_field_name(field_name_to_group)
810 field_name_decoy <- sprintf("group_decoy_%s", simple_field_name)
811 field_name_group <- sprintf("group_%s", simple_field_name)
812
813 data_groups[[field_name_decoy]] <- with(data_groups, ifelse(PMD_FDR_decoy, "decoy", data_groups[[field_name_group]]))
814
815 return(data_groups)
816 }
817 add_group_training_class <- function(data_groups = NULL){
818 df_new <- data_groups
819
820 lowest_confidence_group <- min(data_groups$group_input_score)
821
822 is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) )
823 is_good <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) )
824 is_bad <- with(df_new, (PMD_FDR_decoy == 1) )
825 #is_used_to_train <- with(df_new, used_to_find_middle) # BUGBUG: circular definition
826
827 idx_good <- which(is_good ) # & is_long_enough)
828 n_good <- length(idx_good)
829 idx_testing <- idx_good[c(TRUE,FALSE)] # Selects every other item
830 idx_training <- setdiff(idx_good, idx_testing)
831
832 #is_good_short <- with(df_new, is_good & !is_long_enough )
833 #is_good_long <- with(df_new, is_good & is_long_enough )
834 is_bad_short <- with(df_new, is_bad & !is_long_enough )
835 is_bad_long <- with(df_new, is_bad & is_long_enough )
836 #is_good_training <- with(df_new, is_good_long & (used_to_find_middle == TRUE ) )
837 #is_good_testing <- with(df_new, is_good_long & (used_to_find_middle == FALSE) )
838
839 df_new$group_training_class <- "other_short" # Default
840 df_new$group_training_class[is_long_enough ] <- "other_long" # Default (if long enough)
841 df_new$group_training_class[idx_training ] <- "good_training" # Length does not matter (anymore)
842 df_new$group_training_class[idx_testing ] <- "good_testing" # Ditto
843 #df_new$group_training_class[is_good_short ] <- "good_short"
844 df_new$group_training_class[is_bad_long ] <- "bad_long" # ...except for "bad"
845 df_new$group_training_class[is_bad_short ] <- "bad_short"
846
847 df_new <- add_used_to_find_middle( data_groups = df_new ) # Guarantees consistency between duplicated definitions
848
849 return(df_new)
850 }
851 add_used_to_find_middle <- function(data_groups = NULL){
852 df_new <- data_groups
853 idx_used <- which(data_groups$group_training_class == "good_training")
854
855 df_new$used_to_find_middle <- FALSE
856 df_new$used_to_find_middle[idx_used] <- TRUE
857
858 return(df_new)
859 }
860 add_group_spectrum_index <- function(data_groups = NULL){
861
862 # Supporting functions for add_group_spectrum_index()
863
864 get_breaks_all <- function(df_new){
865 # Supporting function(s) for get_breaks_all()
866
867 get_cut_points <- function(data_subset){
868
869 # Supporting function(s) for get_cut_points()
870
871 cut_values <- function(data=NULL, minimum_segment_length=NULL){
872 # using cpt.mean -- Appears to have a memory leak
873 #results_cpt <- cpt.mean(data=data, method="PELT", minimum_segment_length=minimum_segment_length)
874 #results <- results_cpt@cpts
875
876 # Just look at the end
877 #results <- c(length(data))
878
879 # regularly spaced, slightly larger than minimum_segment_length
880 n_points <- length(data)
881 n_regions <- floor(n_points / minimum_segment_length)
882 n_regions <- ifelse(n_regions == 0, 1, n_regions)
883 results <- round(seq(1, n_points, length.out = n_regions + 1))
884 results <- results[-1]
885 return(results)
886 }
887 remove_last <- function(x){
888 return(x[-length(x)] )
889 }
890
891 # Main code of for get_cut_points()
892 max_idx = max(data_subset$PMD_FDR_spectrum_index)
893 data_sub_sub <- subset(data_subset, group_training_class == "good_training") #(PMD_FDR_input_score==100) & (PMD_FDR_decoy==0))
894 minimum_segment_length = 50
895
896 values <- data_sub_sub$value
897 n_values <- length(values)
898 local_to_global_idx <- data_sub_sub$PMD_FDR_spectrum_index
899 if (n_values <= minimum_segment_length){
900 result <- c()
901 } else {
902 local_idx <- cut_values(data=values, minimum_segment_length=minimum_segment_length)
903 result <- local_to_global_idx[local_idx]
904 result <- remove_last(result)
905 }
906 result <- c(result, max_idx)
907 return(result)
908 }
909 remove_last <- function(vec) {
910 return(vec[-length(vec)])
911 }
912
913 # Main code of get_breaks_all()
914
915 breaks <- 1
916
917 files <- unique(df_new$PMD_FDR_spectrum_file)
918
919 for (local_file in files){
920 data_subset <- subset(df_new, (PMD_FDR_spectrum_file==local_file))
921 if (nrow(data_subset) > 0){
922 breaks <- c(breaks, get_cut_points(data_subset))
923 }
924 }
925 breaks <- sort(unique(breaks))
926 breaks <- remove_last(breaks)
927 breaks <- c(breaks, max(df_new$PMD_FDR_spectrum_index + 1))
928
929 return(breaks)
930 }
931
932 # Main code of add_group_spectrum_index()
933
934 field_name_to_group <- "PMD_FDR_spectrum_index"
935
936 df_new <- data_groups[order(data_groups[[field_name_to_group]]),]
937 breaks <- get_breaks_all(df_new)
938
939 df_new$group_spectrum_index <- cut(x = df_new[[field_name_to_group]], breaks = breaks, right = FALSE, dig.lab = 6)
940 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group)
941
942 return(df_new)
943 }
944 add_median_of_group_index <-function(data_groups = NULL){
945 field_median <- "median_of_group_index"
946 data_good <- subset(data_groups, used_to_find_middle )
947 med <- aggregate(value~group_spectrum_index, data=data_good, FUN=safe_median)
948 med <- rename_column(med, "value", field_median)
949
950 data_groups[[field_median]] <- NULL
951 df_new <- merge(data_groups, med)
952
953 return(df_new)
954 }
955 add_1_percent_to_data_groups <- function(data_groups=NULL){
956
957 data_new <- data_groups
958
959 if (get_raw_1_percent()$exists()){
960 # Load 1 percent file
961 df_1_percent <- get_raw_1_percent()$df
962
963 # Get relevant fields
964 df_1_percent$is_in_1percent <- TRUE
965 df_1_percent <- rename_column(df_1_percent, "Spectrum Title", "PMD_FDR_spectrum_title")
966 df_1_percent <- df_1_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")]
967
968 # Merge with data_groups
969 data_new <- merge(data_new, df_1_percent, all.x=TRUE)
970 data_new$is_in_1percent[is.na(data_new$is_in_1percent)] <- FALSE
971 }
972
973 # Save results
974 return(data_new)
975
976 }
977
978
979 # Main code of make_data_groups()
980 data_groups <- standardize_fields(data_original)
981
982 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score",
983 data_groups = data_groups,
984 vec.length.out = 14,
985 vec.tolerance = 1,
986 value_format = "03d")
987
988 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd",
989 data_groups = data_groups,
990 vec.length.out = 21,
991 vec.tolerance = 0.1,
992 value_format = "+05.1f")
993
994 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length",
995 data_groups = data_groups,
996 vec.length.out = 11,
997 vec.tolerance = 1,
998 value_format = "02d")
999
1000 # data_groups <- add_grouped_variable(field_name_to_group = "m_z",
1001 # data_groups = data_groups,
1002 # vec.length.out = 11,
1003 # vec.tolerance = 10,
1004 # value_format = "04.0f")
1005 #
1006 # data_groups <- add_grouped_variable(field_name_to_group = "measured_mass",
1007 # data_groups = data_groups,
1008 # vec.length.out = 11,
1009 # vec.tolerance = 1,
1010 # value_format = "04.0f")
1011 #
1012 # data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number",
1013 # data_groups = data_groups )
1014 #
1015 # data_groups <- add_already_grouped_variable(field_name_to_group = "charge",
1016 # data_groups = data_groups )
1017 #
1018 data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file",
1019 data_groups = data_groups )
1020 data_groups <- add_protein_group(data_groups = data_groups)
1021 data_groups <- add_group_training_class( data_groups = data_groups)
1022 data_groups <- add_group_spectrum_index( data_groups = data_groups)
1023 data_groups <- add_median_of_group_index( data_groups = data_groups)
1024 data_groups <- add_value_norm( data_groups = data_groups)
1025
1026 # fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
1027 # fields_of_interest <- c("value",
1028 # "PMD_FDR_decoy",
1029 # "PMD_FDR_spectrum_title",
1030 # "median_of_group_index",
1031 # "value_norm",
1032 # "used_to_find_middle",
1033 # "group_training_class",
1034 # fields_of_interest,
1035 # sprintf("group_%s" , fields_of_interest),
1036 # sprintf("group_decoy_%s", fields_of_interest))
1037
1038 fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
1039 fields_of_interest <- c("value",
1040 "PMD_FDR_decoy",
1041 "PMD_FDR_spectrum_title",
1042 "median_of_group_index",
1043 "value_norm",
1044 "used_to_find_middle",
1045 "group_training_class",
1046 fields_of_interest,
1047 sprintf("group_%s" , simplify_field_name(fields_of_interest)),
1048 sprintf("group_decoy_%s", simplify_field_name(fields_of_interest)))
1049
1050 data_groups <- data_groups[,fields_of_interest]
1051 data_groups <- add_1_percent_to_data_groups(data_groups)
1052
1053 return(data_groups)
1054 }
1055
1056 data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df
1057 df <<- make_data_groups(data_original)
1058 },
1059 set_info = function(info){
1060 parents[["info"]] <<- info
1061 },
1062 get_info = function(){
1063 return(verified_element_of_list(parents, "info", "Data_Object_Groupings$parents"))
1064 },
1065 set_data_converter = function(data_converter){
1066 parents[["data_converter"]] <<- data_converter
1067 },
1068 get_data_converter = function(){
1069 return(verified_element_of_list(parents, "data_converter", "Data_Object_Groupings$parents"))
1070 },
1071 set_raw_1_percent = function(raw_1_percent){ ############## BUGBUG: the 1% file should be using the same file type format as the standard data (but isn't)
1072 parents[["raw_1_percent"]] <<- raw_1_percent
1073 },
1074 get_raw_1_percent = function(){
1075 return(verified_element_of_list(parents, "raw_1_percent", "Data_Object_Groupings$parents"))
1076 }
1077 )
1078 ###############################################################################
1079 # Class: Data_Object_Individual_FDR
1080 ###############################################################################
1081 Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR",
1082 contains = "Data_Object",
1083 fields =list(df = "data.frame"))
1084 Data_Object_Individual_FDR$methods(
1085 initialize = function(){
1086 callSuper()
1087 class_name <<- "Data_Object_Individual_FDR"
1088 },
1089 verify = function(){
1090 data_groups = get_data_groups()$df
1091 densities = get_densities()$df
1092 alpha = get_alpha()$df
1093
1094 check_field_name(data_groups, "data_groups", "value_norm")
1095 check_field_name(data_groups, "data_groups", "group_decoy_input_score")
1096 check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length")
1097 check_field_name(data_groups, "data_groups", "PMD_FDR_input_score")
1098 check_field_name(alpha, "alpha", "alpha") # BUGBUG: I'm missing a field here...
1099 check_field_name(densities, "densities", "x")
1100 check_field_name(densities, "densities", "t")
1101 check_field_name(densities, "densities", "f")
1102
1103 },
1104 set_data_groups = function(parent){
1105 parents[["data_groups"]] <<- parent
1106 },
1107 get_data_groups = function(){
1108 return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents"))
1109 },
1110 set_densities = function(parent){
1111 parents[["densities"]] <<- parent
1112 },
1113 get_densities = function(){
1114 return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents"))
1115 },
1116 set_alpha = function(parent){
1117 parents[["alpha"]] <<- parent
1118 },
1119 get_alpha = function(){
1120 return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents"))
1121 },
1122 m_load_data = function(){
1123 add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){
1124 # Support functions for add_FDR_to_data_groups()
1125 get_group_fdr <- function(group_stats = NULL, data_groups = NULL, densities=NULL){
1126 group_fdr <- apply(X = densities, MARGIN = 2, FUN = max)
1127 df_group_fdr <- data.frame(group_fdr)
1128 df_group_fdr <- rename_column(df_group_fdr, "group_fdr", "v")
1129 df_group_fdr$group_of_interest <- names(group_fdr)
1130 t <- df_group_fdr[df_group_fdr$group_of_interest == "t", "v"]
1131 f <- df_group_fdr[df_group_fdr$group_of_interest == "f", "v"]
1132 df_group_fdr <- subset(df_group_fdr, !(group_of_interest %in% c("x", "t", "f")))
1133 df_group_fdr$group_fdr <-(df_group_fdr$v - t) / (f - t)
1134
1135 return(df_group_fdr)
1136 }
1137
1138 get_mode <- function(x){
1139 d <- density(x)
1140 return(d$x[which.max(d$y)])
1141 }
1142
1143 # Main code for add_FDR_to_data_groups()
1144
1145 # Set up analysis
1146 data_new <- data_groups
1147 data_new$value_of_interest <- data_new[,field_value]
1148 data_new$group_of_interest <- data_new[,field_decoy_group]
1149
1150 data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11)
1151
1152 # Identify mean PMD_FDR_input_score per group
1153
1154 group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean)
1155 group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score")
1156
1157 #group_fdr <- get_group_fdr(data_groups = data_subset, densities=densities)
1158 group_stats <- merge(alpha, group_input_score)
1159 group_stats <- subset(group_stats, group_of_interest != "PMD_FDR_decoy")
1160
1161 x=c(0,group_stats$group_input_score)
1162 y=c(1,group_stats$alpha)
1163 FUN_interp <- approxfun(x=x,y=y)
1164
1165 data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score)
1166 if (set_decoy_to_1){
1167 data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1
1168 }
1169
1170 return(data_new)
1171 }
1172
1173 data_groups = get_data_groups()$df
1174 densities = get_densities()$df
1175 alpha = get_alpha()$df
1176
1177 d_true <- densities[,c("x", "t")]
1178 d_false <- densities[,c("x", "f")]
1179
1180 i_fdr <- add_FDR_to_data_groups(data_groups = data_groups,
1181 densities = densities,
1182 alpha = alpha,
1183 field_value ="value_norm",
1184 field_decoy_group = "group_decoy_input_score")
1185 # Derive local t
1186 interp_t <- splinefun(x=d_true$x, y=d_true$t) #approxfun(x=d_true$x, y=d_true$y)
1187
1188 # Derive local f
1189 interp_f <- splinefun(x=d_false$x, y=d_false$f) #approxfun(x=d_true$x, y=d_true$y)
1190
1191 # Derive local FDR
1192 i_fdr$t <- interp_t(i_fdr$value_of_interest)
1193 i_fdr$f <- interp_f(i_fdr$value_of_interest)
1194 i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR
1195 i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t))
1196
1197 df <<- i_fdr
1198
1199 }
1200 )
1201 ###############################################################################
1202 # Class: Data_Object_Densities
1203 ###############################################################################
1204 Data_Object_Densities <- setRefClass("Data_Object_Densities",
1205 contains = "Data_Object",
1206 fields =list(df = "data.frame"))
1207 Data_Object_Densities$methods(
1208 initialize = function(){
1209 callSuper()
1210 class_name <<- "Data_Object_Densities"
1211 },
1212 verify = function(){
1213 df_data_groups <- get_data_groups()$df
1214
1215 checkTrue(nrow(df_data_groups) > 0,
1216 msg = "data_groups data frame was empty (and should not have been)")
1217
1218 check_field_name(df_data_groups, "data_groups", "value_norm")
1219 check_field_name(df_data_groups, "data_groups", "group_decoy_input_score")
1220 check_field_name(df_data_groups, "data_groups", "group_training_class")
1221 },
1222 set_data_groups = function(parent=NULL){
1223 parents[["data_groups"]] <<- parent
1224 },
1225 get_data_groups = function(){
1226 return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents"))
1227 },
1228 m_load_data = function(){
1229
1230 # Support functions for make_densities()
1231 set_values_of_interest <- function(df_data_groups){
1232 field_value = "value_norm"
1233 field_decoy_group = "group_decoy_input_score"
1234 new_data_groups <- get_data_groups()$df
1235 new_data_groups$value_of_interest <- new_data_groups[,field_value]
1236 new_data_groups$group_of_interest <- new_data_groups[,field_decoy_group]
1237 #groups <- sort(unique(new_data_groups$group_of_interest))
1238
1239 return(new_data_groups)
1240 }
1241 get_ylim <- function(data_groups=NULL){
1242 ylim <- range(data_groups$value_of_interest)
1243
1244 return(ylim)
1245 }
1246 make_hit_density <- function(data_subset=NULL, ylim=NULL){
1247 #stop("Data_Object_Densities$make_hit_density() is untested beyond here")
1248 uniformalize_density <- function(d){
1249 # Reorganizes y-values of density function so that
1250 # function is monotone increasing to mode
1251 # and monotone decreasing afterwards
1252 idx_mode <- which.max(d$y)
1253 idx_lower <- 1:(idx_mode-1)
1254 idx_upper <- idx_mode:length(d$y)
1255
1256 values_lower <- d$y[idx_lower]
1257 values_upper <- d$y[idx_upper]
1258
1259 new_d <- d
1260 new_d$y <- c(sort(values_lower, decreasing = FALSE),
1261 sort(values_upper, decreasing = TRUE))
1262
1263 return(new_d)
1264 }
1265
1266 MIN_PEPTIDE_LENGTH = 11
1267 d <- with(subset(data_subset,
1268 (PMD_FDR_peptide_length >= MIN_PEPTIDE_LENGTH) &
1269 (used_to_find_middle == FALSE)),
1270 density(value_of_interest,
1271 from = ylim[1],
1272 to = ylim[2]))
1273 d <- normalize_density( d)
1274 d <- uniformalize_density(d)
1275
1276 return(d)
1277 }
1278 make_true_hit_density <- function(data_groups=NULL){
1279 #stop("Data_Object_Densities$make_true_hit_density() is untested beyond here")
1280 d_true <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ),
1281 ylim = get_ylim(data_groups))
1282 return(d_true)
1283 }
1284 make_false_hit_density <- function(data_groups=NULL){
1285 d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ),
1286 ylim = get_ylim(data_groups))
1287
1288 return(d_false)
1289 }
1290 add_v_densities <- function(data_groups=NULL, densities=NULL){
1291 #stop("Data_Object_Densities$add_v_densities() is untested beyond here")
1292 groups <- sort(unique(data_groups$group_of_interest))
1293
1294 new_densities <- densities
1295
1296 for (local_group in groups){
1297 d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)),
1298 ylim = get_ylim(data_groups))
1299 new_densities[local_group] <- d_v$y
1300 }
1301
1302 return(new_densities)
1303 }
1304
1305 # Main section of make_densities()
1306 df_data_groups <- get_data_groups()$df
1307 new_data_groups <- set_values_of_interest(df_data_groups)
1308 d_true <- make_true_hit_density( new_data_groups)
1309 d_false <- make_false_hit_density(new_data_groups)
1310
1311 densities <- data.frame(x=d_true$x,
1312 t=d_true$y,
1313 f=d_false$y)
1314 densities <- add_v_densities(data_groups=new_data_groups, densities=densities)
1315 df <<- densities
1316 }
1317 )
1318 ###############################################################################
1319 # Class: Data_Object_Alpha
1320 ###############################################################################
1321 Data_Object_Alpha <- setRefClass("Data_Object_Alpha",
1322 contains = "Data_Object",
1323 fields =list(df = "data.frame"))
1324 Data_Object_Alpha$methods(
1325 initialize = function(){
1326 callSuper()
1327 class_name <<- "Data_Object_Alpha"
1328 },
1329 verify = function(){
1330 densities <- get_densities()$df
1331
1332 checkTrue(nrow(densities) > 0,
1333 msg = "Densities data.frame was empty (and should not have been)")
1334 },
1335 set_densities = function(parent=NULL){
1336 parents[["densities"]] <<- parent
1337 },
1338 get_densities = function(){
1339 return(verified_element_of_list(parent_list = parents, element_name = "densities", object_name = "Data_Object_Alpha"))
1340 },
1341 m_load_data = function(){
1342
1343 densities <- get_densities()$df
1344
1345 max_of_density = apply(X = densities, MARGIN = 2, FUN = max)
1346 df_alpha <- data.frame(stringsAsFactors = FALSE,
1347 list(v = max_of_density,
1348 group_of_interest = names(max_of_density)))
1349 df_alpha <- subset(df_alpha, group_of_interest != "x")
1350 t <- with(subset(df_alpha, group_of_interest=="t"), v)
1351 f <- with(subset(df_alpha, group_of_interest=="f"), v)
1352 df_alpha$alpha <- with(df_alpha, (t-v)/(t-f))
1353
1354 alpha <- df_alpha[,c("group_of_interest", "alpha")]
1355 alpha <- subset(alpha, (group_of_interest != "t") & (group_of_interest != "f"))
1356
1357 df <<- alpha
1358 }
1359 )
1360 ###############################################################################
1361 # Class: Data_Processor
1362 ###############################################################################
1363 Data_Processor <- setRefClass("Data_Processor",
1364 fields =list(info = "Data_Object_Info",
1365 raw_data = "Data_Object_Raw_Data",
1366 raw_1_percent = "Data_Object_Raw_1_Percent",
1367 data_converter = "Data_Object_Data_Converter",
1368 data_groups = "Data_Object_Groupings",
1369 densities = "Data_Object_Densities",
1370 alpha = "Data_Object_Alpha",
1371 i_fdr = "Data_Object_Individual_FDR"))
1372 Data_Processor$methods(
1373 initialize = function(p_info=NULL){
1374 if (! is.null(p_info)){
1375 set_info(p_info)
1376 }
1377 },
1378 set_info = function(p_info=NULL){
1379 # This initialization defines all of the dependencies between the various components
1380
1381 info <<- p_info
1382
1383 # raw_data
1384 raw_data$set_info(info)
1385 info$append_child(raw_data)
1386
1387 # raw_1_percent
1388 raw_1_percent$set_info(info)
1389 info$append_child(raw_1_percent)
1390
1391 # data_converter
1392 data_converter$set_info (info)
1393 data_converter$set_raw_data(raw_data)
1394 info $append_child (data_converter)
1395 raw_data $append_child (data_converter)
1396
1397 # data_groups
1398 data_groups$set_info (info)
1399 data_groups$set_data_converter(data_converter)
1400 data_groups$set_raw_1_percent (raw_1_percent)
1401 info $append_child (data_groups)
1402 data_converter$append_child (data_groups)
1403 raw_1_percent $append_child (data_groups)
1404
1405 # densities
1406 densities $set_data_groups(data_groups)
1407 data_groups$append_child (densities)
1408
1409 # alpha
1410 alpha $set_densities(densities)
1411 densities$append_child (alpha)
1412
1413 # i_fdr
1414 i_fdr$set_data_groups(data_groups)
1415 i_fdr$set_densities (densities)
1416 i_fdr$set_alpha (alpha)
1417 data_groups $append_child(i_fdr)
1418 densities $append_child(i_fdr)
1419 alpha $append_child(i_fdr)
1420 }
1421 )
1422
1423
1424 #############################################################
1425 ####### Classes for Plotting
1426 #############################################################
1427
1428 ###############################################################################
1429 # Class: Plot_Image
1430 ###############################################################################
1431 Plot_Image = setRefClass("Plot_Image",
1432 fields = list(data_processors = "list",
1433 plot_title = "character",
1434 include_text = "logical",
1435 include_main = "logical",
1436 x.intersp = "numeric",
1437 y.intersp = "numeric",
1438 scale = "numeric",
1439 main = "character",
1440 is_image_container = "logical"))
1441 Plot_Image$methods(
1442 initialize = function(p_data_processors = list(),
1443 p_include_main = TRUE,
1444 p_include_text = TRUE,
1445 p_is_image_container = FALSE){
1446 include_main <<- p_include_main
1447 include_text <<- p_include_text
1448 data_processors <<- p_data_processors
1449 is_image_container <<- p_is_image_container
1450 },
1451 plot_image = function(){
1452 plot(main="Define plot_image() for subclass") # Abstract function
1453 },
1454 get_n = function(){
1455 stop("Need to define function get_n() for subclass") #Abstract function
1456 },
1457 create_standard_main = function(){
1458 needs_main <- function(){
1459 return(include_text & include_main & !is_image_container)
1460 }
1461 if (needs_main()){
1462 collection_name <- data_processors[[1]]$info$collection_name()
1463 main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name, format(get_n(), big.mark = ","))
1464 }
1465 },
1466 plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){
1467 scale <<- p_scale
1468 SIZE_AXIS <- 2.5 * scale # in the units used by mar
1469 SIZE_MAIN <- 2.5 * scale
1470 SIZE_NO_MARGIN <- 0.1 * scale
1471 FONT_SIZE <- 8 * scale
1472 WINDOW_WIDTH <- window_width * scale
1473 WINDOW_HEIGHT <- window_height * scale
1474 X_INTERSP <- 0.5 * scale + 0.4 # manages legend text spacing
1475 Y_INTERSP <- 0.5 * scale + 0.4 # manages
1476
1477 if (include_main){
1478 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN , SIZE_NO_MARGIN)
1479 } else {
1480 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN)
1481 }
1482 mgp = c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines
1483 ps = FONT_SIZE
1484 x.intersp <<- X_INTERSP
1485 y.intersp <<- Y_INTERSP
1486
1487 windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT)
1488
1489 old_par <- par(mar=mar, ps=ps, mgp=mgp)
1490 create_standard_main()
1491
1492 plot_image()
1493 if (!is_image_container){
1494 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
1495 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
1496 box(lwd=scale)
1497 }
1498 par(old_par)
1499 },
1500 plot_image_in_small_window = function(p_scale=1){
1501 plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25)
1502 },
1503 plot_image_in_large_window = function(p_scale=1, window_height=NULL){
1504 plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7)
1505 }
1506 )
1507 ###############################################################################
1508 # Class: Legend_Object
1509 ###############################################################################
1510 Legend_Object = setRefClass("Legend_Object",
1511 contains = "Plot_Image",
1512 fields = list(user_params = "list",
1513 scale = "numeric"))
1514 Legend_Object$methods(
1515 initialize = function(p_user_params = NULL, p_scale = NULL){
1516 if (is.null(p_user_params)){
1517 user_params <<- list()
1518 } else {
1519 user_params <<- p_user_params
1520 }
1521 if (is.null(p_scale)){
1522 stop("Legend_Object must have a valid scale")
1523 } else {
1524 scale <<- p_scale
1525 }
1526 user_params$x <<- if_null(user_params$x , "topleft", user_params$x)
1527 user_params$y <<- if_null(user_params$y , NULL, user_params$y)
1528 user_params$bty <<- if_null(user_params$bty , "o", user_params$bty)
1529 user_params$lwd <<- if_null(user_params$lwd , NULL, user_params$lwd * scale) # Because we allow NULL, scale must be inside parens
1530 user_params$seg.len <<- if_null(user_params$seg.len , 3, user_params$seg.len ) * scale
1531 user_params$box.lwd <<- if_null(user_params$box.lwd , 1, user_params$box.lwd ) * scale
1532 user_params$x.intersp <<- if_null(user_params$x.intersp, 0.6, user_params$x.intersp) * scale
1533 user_params$y.intersp <<- if_null(user_params$y.intersp, 0.4, user_params$y.intersp) * scale + 0.2
1534 },
1535 show = function(){
1536 first_legend = legend(x = user_params$x,
1537 y = user_params$y,
1538 title = "",
1539 legend = user_params$leg,
1540 col = user_params$col,
1541 bty = user_params$bty,
1542 lty = user_params$lty,
1543 lwd = user_params$lwd,
1544 seg.len = user_params$seg.len,
1545 box.lwd = user_params$box.lwd,
1546 x.intersp = user_params$x.intersp,
1547 y.intersp = user_params$y.intersp)
1548 new_x = first_legend$rect$left
1549 new_y = first_legend$rect$top + first_legend$rect$h * ifelse(scale==1, 0.07, 0.03 - (scale * 0.02)) #switch(scale, 0.01, -0.01, -0.03, -0.05)# (0.07 - 0.09 * ((scale-1)^2))#(0.15 - 0.08*scale)#.07 * (2 - scale)
1550 legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n")
1551
1552 }
1553 )
1554 ###############################################################################
1555 # Class: Plot_Multiple_Images
1556 ###############################################################################
1557 Plot_Multiple_Images = setRefClass("Plot_Multiple_Images",
1558 contains = "Plot_Image",
1559 fields = list(n_images_wide = "numeric",
1560 n_images_tall = "numeric",
1561 image_list = "list"))
1562 Plot_Multiple_Images$methods(
1563 initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){
1564 n_images_wide <<- p_n_images_wide
1565 n_images_tall <<- p_n_images_tall
1566 image_list <<- p_image_list
1567 #plot_title <<- "True Hit and False Hit Distributions"
1568
1569 callSuper(p_is_image_container=TRUE, ...)
1570 },
1571 plot_image = function(){
1572 # Support functions
1573 apply_mtext <- function(letter=NULL){
1574 line=1.3*scale
1575 mtext(letter, side=1, line=line, adj=0)
1576 }
1577 # main code
1578 old_par <- par(mfrow=c(n_images_tall, n_images_wide))
1579 i=0
1580 n_images <- length(image_list)
1581
1582 for (i in 1:n_images){
1583 image <- image_list[[i]]
1584 image$create_standard_main()
1585 image$scale <- scale
1586 image$plot_image()
1587 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
1588 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
1589 box(lwd=scale)
1590 apply_mtext(letter=sprintf("(%s)", letters[i]))
1591
1592 }
1593 par(old_par)
1594
1595 }
1596 )
1597 ###############################################################################
1598 # Class: Plot_Compare_PMD_and_Norm_Density
1599 ###############################################################################
1600 Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density",
1601 contains = "Plot_Image",
1602 fields = list(show_norm = "logical",
1603 display_n_psms = "logical"))
1604 Plot_Compare_PMD_and_Norm_Density$methods(
1605 initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){
1606 show_norm <<- p_show_norm
1607 display_n_psms <<- p_display_n_psms
1608 plot_title <<- "True Hit and False Hit Distributions"
1609
1610 callSuper(...)
1611 },
1612 plot_image = function(){
1613 # Support functions for plot_compare_PMD_and_norm_density()
1614
1615 get_densities <- function(data_subset = NULL, var_value = NULL){
1616 data_subset$value_of_interest <- data_subset[,var_value]
1617 from <- min(data_subset$value_of_interest)
1618 to <- max(data_subset$value_of_interest)
1619 xlim = range(data_subset$value_of_interest)
1620 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
1621 data_false <- subset(data_subset, (PMD_FDR_decoy==1))
1622 d_true <- with(data_true , density(value_of_interest, from = from, to = to))
1623 d_false <- with(data_false, density(value_of_interest, from = from, to = to))
1624 d_true <- normalize_density(d_true)
1625 d_false <- normalize_density(d_false)
1626
1627 densities <- list(d_true=d_true, d_false=d_false, var_value = var_value, n_true = nrow(data_true), n_false = nrow(data_false))
1628
1629 return(densities)
1630 }
1631 get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
1632 xlim <- range(c( densities_a$d_true$x, densities_a$d_false$y))
1633 if (show_norm){
1634 xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$y))
1635 }
1636 return(xlim)
1637 }
1638 get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
1639 ylim <- range(c( densities_a$d_true$y, densities_a$d_false$y))
1640 if (show_norm){
1641 ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y))
1642 }
1643 return(ylim)
1644 }
1645 plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){
1646 leg = list()
1647 leg$leg = c("Good", "Bad")
1648 if (display_n_psms){
1649 leg$leg = sprintf("%s (%d PSMs)",
1650 leg$leg,
1651 c(densities$n_true, densities$n_false))
1652
1653 }
1654 leg$col = c("black", "red")
1655 leg$lwd = c(3 , 3)
1656 leg$lty = c(1 , 2)
1657 leg$title = "Hit Category"
1658 xlab = ifelse(var_value == "value",
1659 "PMD (ppm)",
1660 "PMD - normalized (ppm)")
1661 ylab = "Density"
1662 if (!include_text){
1663 xlab = ""
1664 ylab = ""
1665 }
1666 plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...)
1667 lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2])
1668 abline(v=0, h=0, col="gray", lwd=1*scale)
1669 if (include_text){
1670 legend_object <- Legend_Object$new(leg, scale)
1671 legend_object$show()
1672 #legend("topleft", legend=leg.leg, col=leg.col, lwd=leg.lwd, lty=leg.lty, x.intersp = x.intersp, y.intersp = y.intersp)
1673 }
1674 }
1675
1676 # Main code block for plot_compare_PMD_and_norm_density
1677 data_processor <- data_processors[[1]]
1678 data_processor$data_groups$ensure()
1679 data_groups <- data_processor$data_groups$df
1680
1681 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE)
1682 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
1683
1684 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
1685 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
1686
1687 xlim=get_xlim(densities_a, densities_b, show_norm = show_norm)
1688 ylim=get_ylim(densities_a, densities_b, show_norm = show_norm)
1689
1690 dataset_name <- data_processor$info$collection_name
1691 plot_distributions( densities=densities_a, var_value = "value" , dataset_name = dataset_name, xlim=xlim, ylim=ylim)
1692 if (show_norm){
1693 plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim)
1694 }
1695 },
1696 get_n = function(){
1697 data_processor <- data_processors[[1]]
1698 data_processor$data_groups$ensure()
1699 data_subset_a <- subset(data_processor$data_groups$df , used_to_find_middle == FALSE)
1700 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
1701
1702 if (show_norm){
1703 data_subset <- data_subset_a
1704 } else {
1705 data_subset <- data_subset_b
1706 }
1707
1708 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
1709 data_false <- subset(data_subset, (PMD_FDR_decoy==1))
1710
1711 return(nrow(data_true) + nrow(data_false))
1712 }
1713 )
1714
1715 ###############################################################################
1716 # Class: Plot_Time_Invariance_Alt
1717 ###############################################################################
1718 Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt",
1719 contains = "Plot_Image",
1720 fields = list(show_norm = "logical",
1721 display_n_psms = "logical",
1722 training_class = "character",
1723 ylim = "numeric",
1724 field_of_interest = "character"))
1725 Plot_Time_Invariance_Alt$methods(
1726 initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){
1727 get_subset_title <- function(training_class=NULL){
1728 if (training_class == "bad_long"){
1729 subset_title="bad only"
1730 } else if (training_class == "good_testing"){
1731 subset_title="good-testing only"
1732 } else if (training_class == "good_training"){
1733 subset_title="good-training only"
1734 } else if (training_class == "other"){
1735 subset_title="other only"
1736 } else {
1737 stop("Unexpected training_class in plot_time_invariance")
1738 }
1739 return(subset_title)
1740 }
1741
1742 ylim <<- p_ylim
1743 training_class <<- p_training_class
1744 field_of_interest <<- p_field_of_interest
1745 subset_title <- get_subset_title(training_class=training_class)
1746 backup_title <- sprintf("Middle 25%% PMD for spectra sorted by index%s",
1747 ifelse(is.null(subset_title),
1748 "",
1749 sprintf(" - %s", subset_title)))
1750 #plot_title <<- get_main(main_title=main, backup_title=backup_title, data_collection = data_collection)
1751 plot_title <<- backup_title
1752
1753 callSuper(...)
1754 },
1755 plot_image = function(){
1756 # Support functions for plot_time_invariance()
1757
1758 # Main code of plot_time_invariance()
1759 data_subset = get_data_subset()
1760 plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset)
1761 abline(h=0, col="blue", lwd=scale)
1762 },
1763 get_data_subset = function(){
1764 data_processor <- data_processors[[1]]
1765 data_processor$data_groups$ensure()
1766 return(subset(data_processor$data_groups$df, (group_training_class==training_class)))
1767 },
1768 get_n = function(){
1769 return(nrow(get_data_subset()))
1770 },
1771 plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){
1772 n_plot_groups <- 100
1773
1774 field_name_text <- ifelse(field_of_interest=="value", "PMD", "Translated PMD")
1775 new_subset <- data_subset
1776 new_subset$value_of_interest <- new_subset[,field_of_interest]
1777 new_subset <- new_subset[order(new_subset$PMD_FDR_spectrum_index),]
1778
1779 idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1)
1780 idxs_left <- idxs[-(n_plot_groups+1)]
1781 idxs_right <- idxs[-1] - 1
1782 idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1
1783
1784 new_subset$plot_group <- NA
1785 for (i in 1:n_plot_groups){
1786 new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i
1787 }
1788 xleft <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=min)
1789 xright <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=max)
1790 ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))})
1791 ytop <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))})
1792 boxes <- merge( rename_column(xleft , "PMD_FDR_spectrum_index" , "xleft"),
1793 merge( rename_column(xright , "PMD_FDR_spectrum_index" , "xright"),
1794 merge(rename_column(ybottom, "value_of_interest", "ybottom"),
1795 rename_column(ytop , "value_of_interest", "ytop"))))
1796
1797 xlab <- "Spectrum Index"
1798 ylab <- sprintf("%s (ppm)", field_name_text )
1799 if (is.null(ylim)){
1800 ylim <<- range(new_subset$value_of_interest)
1801 }
1802 if (!include_text){
1803 xlab=""
1804 ylab=""
1805 }
1806 plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
1807 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
1808 #points(median_of_group_index~PMD_FDR_spectrum_index, data=data_subset, cex=.5, pch=15)
1809 axis(1, labels=include_text, lwd=scale)
1810 axis(2, labels=include_text, lwd=scale)
1811 box(lwd=scale) #box around plot area
1812 }
1813
1814 )
1815 ###############################################################################
1816 # Class: Plot_Time_Invariance_Alt_Before_and_After
1817 ###############################################################################
1818 Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After",
1819 contains = "Plot_Multiple_Images",
1820 fields = list())
1821 Plot_Time_Invariance_Alt_Before_and_After$methods(
1822 initialize = function(p_data_processors = NULL,
1823 p_include_text=TRUE,
1824 p_include_main=FALSE,
1825 p_ylim = c(-4,4), ...){
1826 plot_object1 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors,
1827 p_include_text=p_include_text,
1828 p_include_main=p_include_main,
1829 p_training_class = "good_testing",
1830 p_field_of_interest = "value",
1831 p_ylim = p_ylim)
1832
1833 plot_object2 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors,
1834 p_include_text=p_include_text,
1835 p_include_main=p_include_main,
1836 p_training_class = "good_testing",
1837 p_field_of_interest = "value_norm",
1838 p_ylim = p_ylim)
1839
1840 callSuper(p_n_images_wide=1,
1841 p_n_images_tall=2,
1842 p_include_text=p_include_text,
1843 p_include_main=p_include_main,
1844 p_image_list = list(plot_object1, plot_object2), ...)
1845 }
1846 )
1847
1848 ###############################################################################
1849 # Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length
1850 ###############################################################################
1851 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length",
1852 contains = "Plot_Image",
1853 fields = list(show_norm = "logical"))
1854 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods(
1855 initialize = function(p_show_norm=FALSE, ...){
1856 plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length"
1857 show_norm <<- p_show_norm
1858 callSuper(...)
1859 },
1860 get_n = function(){
1861 data_processor <- data_processors[[1]]
1862 data_processor$data_groups$ensure()
1863 data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1))
1864 return(nrow(data_subset))
1865 },
1866 plot_image = function(){
1867
1868 # Support functions for plot_density_PMD_and_norm_decoy_by_aa_length()
1869
1870 add_group_peptide_length_special <- function(){
1871 data_processor <- data_processors[[1]]
1872 data_processor$data_groups$ensure()
1873 data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object
1874 data_groups <- subset(data_groups, used_to_find_middle == FALSE)
1875
1876 df_group_definition <- data.frame(stringsAsFactors = FALSE,
1877 list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"),
1878 min = c( 6 , 9 , 11 , 13 , 16 , 21 ),
1879 max = c( 8 , 10 , 12 , 15 , 20 , 50 ) ))
1880 group_peptide_length_special <- data.frame(list(PMD_FDR_peptide_length = 6:50))
1881 group_peptide_length_special$min <- with(group_peptide_length_special, sapply(PMD_FDR_peptide_length, FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i])))
1882 group_peptide_length_special <- merge(group_peptide_length_special, df_group_definition)
1883
1884 data_groups$group_peptide_length_special <- NULL
1885 new_data_groups <- (merge(data_groups,
1886 group_peptide_length_special[,c("PMD_FDR_peptide_length",
1887 "group_peptide_length_special")]))
1888 return(new_data_groups)
1889 }
1890 get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){
1891 get_density_from_subset <- function(data_subset=NULL, xlim=NULL){
1892
1893 d_group <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2]))
1894 d_group <- normalize_density(d_group)
1895
1896 return(d_group)
1897 }
1898
1899 data_temp <- data_subset
1900 data_temp$value_of_interest <- data_temp[[field_value]]
1901 data_temp$group_of_interest <- data_temp[[field_group]]
1902
1903 xlim = range(data_temp$value_of_interest)
1904
1905 groups <- sort(unique(data_temp$group_of_interest))
1906 n_groups <- length(groups)
1907
1908 d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim )
1909 densities <- list("All decoys" = d_group)
1910 for (i in 1:n_groups){
1911 group <- groups[i]
1912
1913 d_group <- get_density_from_subset( data_subset=subset(data_temp, (group_of_interest == group)),
1914 xlim = xlim )
1915 densities[[group]] <- d_group
1916 }
1917
1918 return(densities)
1919 }
1920 get_limits <- function(densities_a = NULL, densities_b = NULL){
1921 xlim = c()
1922 ylim = c(0)
1923 for (single_density in densities_a){
1924 xlim=range(c(xlim, single_density$x))
1925 ylim=range(c(ylim, single_density$y))
1926 }
1927 for (single_density in densities_b){
1928 xlim=range(c(xlim, single_density$x))
1929 ylim=range(c(ylim, single_density$y))
1930 }
1931
1932 return(list(xlim=xlim, ylim=ylim))
1933 }
1934 plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){
1935 data_groups$group_of_interest <- data_groups[[field_group]]
1936 data_groups$value_of_interest <- data_groups[[field_value]]
1937
1938 # Main body of plot_decoy_distribution_by_field_of_interest()
1939 FIXED_LWD=3
1940
1941 groups <- sort(unique(data_groups$group_of_interest))
1942 n <- length(groups)
1943
1944 df_leg <- data.frame(stringsAsFactors = FALSE,
1945 list(leg = groups,
1946 col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4),
1947 lty = rep(1:6, length.out=n),
1948 lwd = rep(FIXED_LWD , n)) )
1949
1950 d <- densities[["All decoys"]]
1951
1952 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier)
1953 ylab = "Density"
1954
1955 if (!include_text){
1956 xlab=""
1957 ylab=""
1958 }
1959 plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n")
1960
1961 ave_peak <- max(d$y)
1962 max_peak <- 0
1963
1964 for (local_group in groups){
1965 data_subset <- subset(data_groups, group_of_interest == local_group)
1966 data_info <- subset(df_leg , leg == local_group)
1967 col <- data_info$col[1]
1968 lty <- data_info$lty[1]
1969 lwd <- data_info$lwd[1]
1970 if (nrow(data_subset) > 100){
1971 d <- densities[[local_group]] #density(data_subset[[field_value]])
1972 lines(d, col=col, lty=lty, lwd=lwd * scale)
1973 peak <- max(d$y)
1974 max_peak <- max(max_peak, peak)
1975 }
1976 }
1977 abline(v=0, h=0, lwd=scale)
1978 leg <- list(title = "Peptide length (aa)",
1979 leg = c("All decoys" , df_leg$leg),
1980 col = c(col2hex("black") , df_leg$col),
1981 lty = c(1 , df_leg$lty),
1982 lwd = c(FIXED_LWD , df_leg$lwd)
1983 )
1984 if (include_text){
1985 legend_object = Legend_Object$new(leg, scale)
1986 legend_object$show()
1987 #first_legend = legend(x="topleft", title = "", legend = leg$leg, col = leg$col, lty = leg$lty, lwd = leg$lwd, seg.len=leg$seg.len, box.lwd=leg$box.lwd, x.intersp = leg$x.intersp, y.intersp = leg$y.intersp)
1988 #new_x = first_legend$rect$left
1989 #new_y = first_legend$rect$top + first_legend$rect$h * .07 * (2 - scale)
1990 #legend(x=new_x, y=new_y, title = leg$title, legend = "", cex=1.15, bty="n")
1991 }
1992
1993 box(lwd=scale) #box around plot area
1994
1995 }
1996
1997 # Main body for plot_density_PMD_and_norm_decoy_by_aa_length()
1998
1999 data_mod <- add_group_peptide_length_special()
2000 data_mod <- subset(data_mod, PMD_FDR_decoy==1)
2001
2002 densities_a <- get_densities(data_subset = data_mod, field_value = "value" , field_group = "group_peptide_length_special")
2003 densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special")
2004
2005 data_processor <- data_processors[[1]]
2006 dataset_name <- data_processor$info$collection_name()
2007
2008 limits <- get_limits(densities_a, densities_b)
2009 xlim <- limits$xlim
2010 ylim <- limits$ylim
2011
2012 if (show_norm){
2013 plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
2014 } else {
2015 plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value" , xlab_modifier = "" , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
2016 }
2017 }
2018
2019 )
2020
2021 ###############################################################################
2022 # Class: Plot_Bad_CI
2023 ###############################################################################
2024 Plot_Bad_CI = setRefClass("Plot_Bad_CI",
2025 contains = "Plot_Image",
2026 fields = list(breaks = "numeric",
2027 ylim = "numeric"))
2028 Plot_Bad_CI$methods(
2029 initialize = function(p_breaks=20, p_ylim=NULL, ...){
2030 if (is.null(p_ylim)){
2031 ylim <<- numeric(0)
2032 } else {
2033 ylim <<- p_ylim
2034 }
2035 breaks <<- p_breaks
2036 plot_title <<- "Credible Intervals for proportion within range - bad"
2037 callSuper(...)
2038 },
2039 data_processor = function(){
2040 return(data_processors[[1]])
2041 },
2042 get_n = function(){
2043 data_processor()$data_groups$ensure()
2044 return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1))))
2045 },
2046 plot_image = function(){
2047 data_processor()$data_groups$ensure()
2048 data_groups <- data_processor()$data_groups$df
2049 data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long")
2050 data_decoy$region <- cut(x = data_decoy$value, breaks = breaks)
2051 table(data_decoy$region)
2052 regions <- unique(data_decoy$region)
2053
2054 N = nrow(data_decoy)
2055 find_lower_ci_bound <- function(x){
2056 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05)
2057 return(ci[1])
2058 }
2059 find_upper_ci_bound <- function(x){
2060 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05)
2061 return(ci[2])
2062 }
2063 xleft <- aggregate(value~region, data=data_decoy, FUN=min)
2064 xright <- aggregate(value~region, data=data_decoy, FUN=max)
2065 ytop <- aggregate(value~region, data=data_decoy, FUN=find_upper_ci_bound)
2066 ybottom <- aggregate(value~region, data=data_decoy, FUN=find_lower_ci_bound)
2067
2068 xleft <- rename_column(xleft , "value", "xleft" )
2069 xright <- rename_column(xright , "value", "xright" )
2070 ytop <- rename_column(ytop , "value", "ytop" )
2071 ybottom <- rename_column(ybottom, "value", "ybottom")
2072
2073 boxes <- merge(merge(xleft, xright), merge(ytop, ybottom))
2074
2075
2076 xlab <- "Precursor Mass Discrepancy (ppm)"
2077 ylab <- "Proportion of PSMs\nin subgroup"
2078 xlim=range(data_decoy$value)
2079 get_ylim(boxes=boxes)
2080 if (!include_text){
2081 xlab=""
2082 ylab=""
2083 }
2084
2085 plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
2086
2087 with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale))
2088
2089 abline(h=1/breaks, col="blue", lwd=scale)
2090 },
2091 get_ylim = function(boxes=NULL){
2092 is_valid_range <- function(r=NULL){
2093 return(length(r) == 2)
2094 }
2095 if (! is_valid_range(ylim)){
2096 ylim <<- range(c(0,boxes$ytop, boxes$ybottom))
2097 }
2098 }
2099
2100 )
2101 ###############################################################################
2102 # Class: Plot_Selective_Loss
2103 ###############################################################################
2104 Plot_Selective_Loss = setRefClass("Plot_Selective_Loss",
2105 contains = "Plot_Image",
2106 fields = list())
2107 Plot_Selective_Loss$methods(
2108 initialize = function( ...){
2109 plot_title <<- "PMD-FDR Selectively removes Bad Hits"
2110 callSuper(...)
2111 },
2112 data_processor = function(){
2113 return(data_processors[[1]])
2114 },
2115 get_n = function(){
2116 data_processor()$i_fdr$ensure()
2117 data_subset <- data_processor()$i_fdr$df
2118 return(nrow(data_subset))
2119 },
2120 plot_image = function(){
2121 # Support functions for plot_selective_loss()
2122
2123 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
2124 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold)
2125 tbl <- with(updated_i_fdr,
2126 table(PMD_FDR_input_score >= score_threshold,
2127 new_confidence < score_threshold,
2128 group_decoy_proteins))
2129 df <- data.frame(tbl)
2130 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
2131 df_n <- rename_column(df_n, name_before = "Freq", "n")
2132 df <- merge(df, df_n)
2133 df$rate_of_loss <- with(df, Freq/n)
2134 df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
2135 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
2136 if (nrow(df) > 0){
2137 df$score_threshold <- score_threshold
2138 }
2139 return(df)
2140 }
2141 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
2142 df=data.frame()
2143 for (score_threshold in score_thresholds){
2144 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold)
2145 df <- rbind(df, df_new_loss)
2146 }
2147 return(df)
2148 }
2149
2150 # Main code for plot_selective_loss()
2151
2152 updated_i_fdr <- data_processor()$i_fdr$df
2153 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100)))
2154 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
2155 xlim <- with(loss_record, range(score_threshold))
2156 ylim <- c(0,1)
2157 xlab <- "Fixed Confidence threshold (PeptideShaker score)"
2158 ylab <- "Rate of PSM disqualification from PMD-FDR"
2159 lwd <- 4
2160 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
2161
2162 groups <- sort(unique(loss_record$group_decoy_proteins))
2163 n_g <- length(groups)
2164
2165 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
2166 ltys <- rep(1:6, length.out=n_g)
2167
2168 leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category")
2169
2170 for (i in 1:n_g){
2171 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
2172 }
2173 abline(h=0, v=100, lwd=scale)
2174 abline(h=c(0.1, 0.8), col="gray", lwd=scale)
2175
2176 #leg = list(leg=group, col=col, lty=lty, lwd=lwd)
2177 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len))
2178 legend_object <- Legend_Object$new(leg, scale)
2179 legend_object$show()
2180 }
2181
2182 )
2183 ###############################################################################
2184 # Class: Plot_Selective_Loss_for_TOC
2185 ###############################################################################
2186 Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC",
2187 contains = "Plot_Image",
2188 fields = list(xlab="character",
2189 ylab="character",
2190 title_x="numeric",
2191 title_y="numeric",
2192 legend_border="logical",
2193 legend_x = "numeric",
2194 legend_y = "numeric",
2195 legend_title="character",
2196 legend_location = "character",
2197 name_contaminant = "character",
2198 name_decoy = "character",
2199 name_human = "character",
2200 name_pyro = "character"))
2201 Plot_Selective_Loss_for_TOC$methods(
2202 initialize = function( ...){
2203 plot_title <<- "PMD-FDR selectively removes bad hits"
2204 callSuper(...)
2205 xlab <<- "Confidence threshold (PeptideShaker)"
2206 ylab <<- "PMD Disqualifiction Rate"
2207 legend_border <<- FALSE
2208 #legend_title <<- "Species/Category"
2209 title_x <<- 50
2210 title_y <<- 0.9
2211 legend_x <<- 10
2212 legend_y <<- 0.75
2213 name_contaminant <<- "signal - contaminant"
2214 name_decoy <<- "decoy - reversed"
2215 name_human <<- "decoy - human"
2216 name_pyro <<- "signal - pyrococcus"
2217 },
2218 data_processor = function(){
2219 return(data_processors[[1]])
2220 },
2221 get_n = function(){
2222 data_processor()$i_fdr$ensure()
2223 data_subset <- data_processor()$i_fdr$df
2224 return(nrow(data_subset))
2225 },
2226 plot_image = function(){
2227 # Support functions for plot_selective_loss()
2228
2229 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
2230 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold)
2231 tbl <- with(updated_i_fdr,
2232 table(PMD_FDR_input_score >= score_threshold,
2233 new_confidence < score_threshold,
2234 group_decoy_proteins))
2235 df <- data.frame(tbl)
2236 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
2237 df_n <- rename_column(df_n, name_before = "Freq", "n")
2238 df <- merge(df, df_n)
2239 df$rate_of_loss <- with(df, Freq/n)
2240 df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
2241 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
2242 if (nrow(df) > 0){
2243 df$score_threshold <- score_threshold
2244 }
2245 return(df)
2246 }
2247 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
2248 df=data.frame()
2249 for (score_threshold in score_thresholds){
2250 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold)
2251 df <- rbind(df, df_new_loss)
2252 }
2253 return(df)
2254 }
2255 convert_groups <- function(groups=NULL){
2256 new_groups <- groups
2257 new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups)
2258 new_groups <- gsub(pattern = "decoy" , replacement = name_decoy , x = new_groups)
2259 new_groups <- gsub(pattern = "human" , replacement = name_human , x = new_groups)
2260 new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro , x = new_groups)
2261
2262 return(new_groups)
2263 }
2264
2265 # Main code for plot_selective_loss()
2266
2267 updated_i_fdr <- data_processor()$i_fdr$df
2268 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100)))
2269 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
2270 xlim <- with(loss_record, range(score_threshold))
2271 ylim <- c(0,1)
2272 #xlab <- "Fixed Confidence threshold (PeptideShaker score)"
2273 #ylab <- "Rate of PSM disqualification from PMD-FDR"
2274 lwd <- 4
2275 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
2276
2277 groups <- sort(unique(loss_record$group_decoy_proteins))
2278 n_g <- length(groups)
2279
2280 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
2281 ltys <- rep(1:6, length.out=n_g)
2282 bty <- ifelse(legend_border, "o", "n")
2283
2284 leg <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y)
2285 #leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y)
2286
2287 for (i in 1:n_g){
2288 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
2289 }
2290 abline(h=0, v=100, lwd=scale)
2291 abline(h=c(0.1, 0.8), col="gray", lwd=scale)
2292
2293 #leg = list(leg=group, col=col, lty=lty, lwd=lwd)
2294 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len))
2295 legend_object <- Legend_Object$new(leg, scale)
2296 legend_object$show()
2297 text(x=title_x, y=title_y, labels = plot_title)
2298 }
2299
2300 )
2301 ###############################################################################
2302 # Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR
2303 ###############################################################################
2304 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR",
2305 contains = "Plot_Image",
2306 fields = list())
2307 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods(
2308 initialize = function( ...){
2309 plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs"
2310 callSuper(...)
2311 },
2312 data_processor = function(){
2313 return(data_processors[[1]])
2314 },
2315 get_n = function(){
2316 data_processor()$i_fdr$ensure()
2317 if (one_percent_calculation_exists()){
2318 i_fdr <- data_processor()$i_fdr$df
2319 data_subset <- subset(i_fdr, is_in_1percent==TRUE)
2320 n <- nrow(data_subset)
2321 } else {
2322 n <- 0
2323 }
2324
2325 return (n)
2326 },
2327 plot_image = function(){
2328 if (one_percent_calculation_exists()){
2329 i_fdr <- get_modified_fdr()
2330 report_good_discrepancies(i_fdr)
2331 data_TD_good <- get_data_TD_good(i_fdr)
2332 mean_results <- get_mean_results(data_TD_good)
2333 boxes <- mean_results
2334 boxes <- rename_columns(df = boxes,
2335 names_before = c("min_conf", "max_conf", "lower" , "upper"),
2336 names_after = c("xleft" , "xright" , "ybottom", "ytop" ))
2337 xlim <- range(boxes[,c("xleft", "xright")])
2338 ylim <- range(boxes[,c("ybottom", "ytop")])
2339
2340 #head(mean_results)
2341
2342 xlab = "Confidence Score (Peptide Shaker)"
2343 ylab = "Mean PMD i-FDR"
2344
2345 if (!include_text){
2346 xlab=""
2347 ylab=""
2348 }
2349
2350 plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale)
2351 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
2352 abline(b=-1, a=100, lwd=4*scale, col="dark gray")
2353 abline(h=0, v=100, lwd=1*scale)
2354
2355 } else {
2356 stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name()))
2357 }
2358 },
2359 get_mean_results = function(data_TD_good = NULL){
2360 mean_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=mean)
2361 mean_i_fdr <- rename_column(mean_i_fdr, "i_fdr", "mean_i_fdr")
2362 sd_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=sd)
2363 sd_i_fdr <- rename_column(sd_i_fdr, "i_fdr", "sd_i_fdr")
2364 n_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=length)
2365 n_i_fdr <- rename_column(n_i_fdr, "i_fdr", "n")
2366 mean_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=mean)
2367 mean_conf <- rename_column(mean_conf, "PMD_FDR_input_score", "mean_conf")
2368 min_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=min)
2369 min_conf <- rename_column(min_conf, "PMD_FDR_input_score", "min_conf")
2370 max_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=max)
2371 max_conf <- rename_column(max_conf, "PMD_FDR_input_score", "max_conf")
2372
2373 mean_results <- mean_i_fdr
2374 mean_results <- merge(mean_results, sd_i_fdr)
2375 mean_results <- merge(mean_results, n_i_fdr)
2376 mean_results <- merge(mean_results, mean_conf)
2377 mean_results <- merge(mean_results, min_conf)
2378 mean_results <- merge(mean_results, max_conf)
2379
2380 mean_results$se <- with(mean_results, sd_i_fdr / sqrt(n - 1))
2381 mean_results$lower <- with(mean_results, mean_i_fdr - 2*se)
2382 mean_results$upper <- with(mean_results, mean_i_fdr + 2*se)
2383 return(mean_results)
2384 },
2385 get_data_TD_good = function(i_fdr=NULL){
2386 data_TD_good <- subset(i_fdr, TD_good==TRUE)
2387 data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),]
2388 n <- nrow(data_TD_good)
2389 data_TD_good$conf_group <- cut(1:n, breaks=floor(n/100))
2390 data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr
2391 return(data_TD_good)
2392 },
2393 get_modified_fdr = function(){
2394 i_fdr <- data_processor()$i_fdr$df
2395 i_fdr$PMD_good <- i_fdr$i_fdr < 0.01
2396 i_fdr$TD_good <- i_fdr$is_in_1percent == TRUE
2397 i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100
2398 return(i_fdr)
2399 },
2400 one_percent_calculation_exists = function(){
2401 data_processor()$raw_1_percent$ensure()
2402 return(data_processor()$raw_1_percent$exists())# "is_in_1percent" %in% colnames(data_processor()$i_fdr))
2403 },
2404 report_good_discrepancies = function(i_fdr=NULL){
2405 with(subset(i_fdr, (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2406 with(subset(i_fdr, (PMD_FDR_input_score==100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2407 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2408 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2409 with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
2410 }
2411
2412 )
2413
2414 ###############################################################################
2415 # Class: Plot_Density_PMD_by_Score
2416 ###############################################################################
2417 Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score",
2418 contains = "Plot_Image",
2419 fields = list(show_norm = "logical"))
2420 Plot_Density_PMD_by_Score$methods(
2421 initialize = function(p_show_norm=FALSE, ...){
2422 show_norm <<- p_show_norm
2423 plot_title <<- "PMD distribution, by Confidence ranges"
2424 callSuper(...)
2425
2426 },
2427 data_processor = function(){
2428 return(data_processors[[1]])
2429 },
2430 get_n = function(){
2431 return(nrow(data_processor()$data_groups$df))
2432 #data_subset <- data_collection$i_fdr
2433 #return(nrow(data_subset))
2434 },
2435 get_modified_data_groups = function(var_value = NULL){
2436 # Note: Filters out used_to_find_middle
2437 # Note: Creates "value_of_interest" field
2438 # Note: Remakes "group_decoy_input_score" field
2439 data_new <- data_processor()$data_groups$df
2440 data_new <- subset(data_new, !used_to_find_middle )
2441 data_new$value_of_interest <- data_new[, var_value]
2442
2443 cutoff_points <- c(100, 100, 95, 80, 50, 0, 0)
2444 n <- length(cutoff_points)
2445 uppers <- cutoff_points[-n]
2446 lowers <- cutoff_points[-1]
2447
2448 for (i in 1:(n-1)){
2449 upper <- uppers[i]
2450 lower <- lowers[i]
2451
2452
2453 if (lower==upper){
2454 idx <- with(data_new, which( (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0)))
2455 cat_name <- sprintf("%d", upper)
2456 } else {
2457 idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score < upper) & (PMD_FDR_decoy == 0)))
2458 cat_name <- sprintf("%02d - %2d", lower, upper)
2459 }
2460 data_new$group_decoy_input_score[idx] <- cat_name
2461 }
2462
2463 return(data_new)
2464 },
2465 plot_image = function(){
2466
2467 # Support functions for plot_density_PMD_by_score()
2468
2469 get_densities <- function(data_subset = NULL, var_value = NULL){
2470
2471 # Support functions for get_densities()
2472
2473 # New version
2474
2475 # Main body of get_densities()
2476
2477 data_subset <- get_modified_data_groups(var_value=var_value)
2478 #data_subset$value_of_interest <- data_subset[,var_value]
2479 from <- min(data_subset$value_of_interest)
2480 to <- max(data_subset$value_of_interest)
2481 xlim = range(data_subset$value_of_interest)
2482
2483 groups <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE)
2484 n_groups <- length(groups)
2485
2486 densities <- list(var_value = var_value, groups=groups)
2487
2488 for (i in 1:n_groups){
2489 group <- groups[i]
2490
2491 data_group_single <- subset(data_subset, (group_decoy_input_score == group))
2492 d_group <- with(data_group_single , density(value_of_interest, from = from, to = to))
2493 d_group <- normalize_density(d_group)
2494
2495 densities[[group]] <- d_group
2496 }
2497
2498 return(densities)
2499
2500 }
2501 get_xlim <- function(densities_a = NULL, densities_b = NULL){
2502 groups <- densities_a$groups
2503
2504 xlim <- 0
2505 for (group in groups){
2506 xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x)
2507 }
2508
2509 return(xlim)
2510
2511 }
2512 get_ylim <- function(densities_a = NULL, densities_b = NULL){
2513 groups <- densities_a$groups
2514
2515 ylim <- 0
2516 for (group in groups){
2517 ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y)
2518 }
2519
2520 return(ylim)
2521
2522 }
2523 plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){
2524 data_groups <- get_modified_data_groups(var_value=var_value)
2525 groups <- sort(unique(data_groups$group_decoy_input_score))
2526 n_groups <- length(groups)
2527
2528 groups_std <- setdiff(groups, c("100", "decoy", "0") )
2529 groups_std <- sort(groups_std, decreasing = TRUE)
2530 groups_std <- c(groups_std, "0")
2531 n_std <- length(groups_std)
2532 cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5)
2533
2534 leg <- list(group = c("100" , groups_std , "decoy" ),
2535 leg = c("100" , groups_std , "All Decoys" ),
2536 col = c(col2hex("black") , cols , col2hex("purple", col_alpha = 0.5)),
2537 lwd = c(4 , rep(2, n_std), 4 ),
2538 title = "Confidence Score")
2539
2540 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)",
2541 xlab_modifier)
2542 ylab = "Density"
2543 if (!include_text){
2544 xlab=""
2545 ylab=""
2546 }
2547
2548
2549 plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...)
2550
2551 include_peak_dots = FALSE # BUGBUG: Disabling this for now. Need to move this to class parameter
2552
2553 for (i in 1:length(leg$group)){
2554 group <- leg$group[i]
2555 d <- densities[[group]]
2556 lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale)
2557 if (include_peak_dots){
2558 x=d$x[which.max(d$y)]
2559 y=max(d$y)
2560 points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale)
2561 }
2562 }
2563
2564 abline(v=0, lwd=scale)
2565
2566 if (include_text){
2567 legend_object = Legend_Object$new(leg, scale)
2568 legend_object$show()
2569 }
2570
2571 }
2572
2573 # Main body for plot_density_PMD_by_score()
2574
2575 data_groups <- data_processor()$data_groups$df
2576
2577 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE)
2578 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
2579
2580 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
2581 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
2582
2583 xlim=get_xlim(densities_a, densities_b)
2584 ylim=get_ylim(densities_a, densities_b)
2585
2586 dataset_name <- data_processor()$info$collection_name()
2587 if (show_norm){
2588 plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim)
2589 } else {
2590 plot_distributions(densities=densities_a, var_value = "value" , xlab_modifier = "" , xlim=xlim, ylim=ylim)
2591 }
2592 }
2593 )
2594 ###############################################################################
2595 # Class: Plot_Dataset_Description
2596 ###############################################################################
2597 Plot_Dataset_Description = setRefClass("Plot_Dataset_Description",
2598 contains = "Plot_Multiple_Images",
2599 fields = list(ylim_time_invariance = "numeric"))
2600 Plot_Dataset_Description$methods(
2601 initialize = function(p_data_processors = NULL,
2602 p_include_text=TRUE,
2603 p_include_main=FALSE,
2604 p_ylim_time_invariance = c(-4,4), ...){
2605 plot_object_r1_c1 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors,
2606 p_include_text=p_include_text,
2607 p_include_main=p_include_main,
2608 p_training_class = "good_testing",
2609 p_field_of_interest = "value",
2610 p_ylim = p_ylim_time_invariance)
2611
2612 plot_object_r1_c2 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors,
2613 p_include_text=p_include_text,
2614 p_include_main=p_include_main,
2615 p_training_class = "good_testing",
2616 p_field_of_interest = "value_norm",
2617 p_ylim = p_ylim_time_invariance)
2618 plot_object_r2_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors,
2619 p_show_norm=FALSE,
2620 p_include_text=p_include_text,
2621 p_include_main=p_include_main)
2622
2623 plot_object_r2_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors,
2624 p_show_norm=FALSE,
2625 p_include_text=p_include_text,
2626 p_include_main=p_include_main)
2627
2628 plot_object_r3_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors,
2629 p_show_norm=TRUE,
2630 p_include_text=p_include_text,
2631 p_include_main=p_include_main)
2632 plot_object_r3_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors,
2633 p_show_norm=TRUE,
2634 p_include_text=p_include_text,
2635 p_include_main=p_include_main)
2636 callSuper(p_n_images_wide=2,
2637 p_n_images_tall=3,
2638 p_include_text=p_include_text,
2639 p_include_main=p_include_main,
2640 p_image_list = list(plot_object_r1_c1, plot_object_r1_c2,
2641 plot_object_r2_c1, plot_object_r2_c2,
2642 plot_object_r3_c1, plot_object_r3_c2), ...)
2643
2644 }
2645 )
2646 ###############################################################################
2647 # Class: Plots_for_Paper
2648 ###############################################################################
2649 Plots_for_Paper <- setRefClass("Plots_for_Paper", fields =list(data_processor_a = "Data_Processor",
2650 data_processor_b = "Data_Processor",
2651 data_processor_c = "Data_Processor",
2652 data_processor_d = "Data_Processor",
2653 include_text = "logical",
2654 include_main = "logical",
2655 mai = "numeric"))
2656 Plots_for_Paper$methods(
2657 initialize = function(){
2658 data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new())
2659 data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new())
2660 data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr $new())
2661 data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations $new())
2662 },
2663 create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){
2664 print_table_4_data()
2665 print_figure_2_data()
2666 plot_figure_D(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2667 plot_figure_C(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2668 plot_figure_B(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2669 plot_figure_A(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2670 plot_figure_8(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2671 plot_figure_7(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2672 plot_figure_6(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main)
2673 plot_figure_5(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2674 plot_figure_4(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main)
2675 plot_figure_3(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main)
2676 },
2677 print_figure_2_data = function(){
2678 print(create_stats_for_grouping_figure(list(data_processor_a)))
2679 },
2680 print_table_4_data = function(){
2681 report_ranges_of_comparisons(processors = list(data_processor_a))
2682 report_ranges_of_comparisons(processors = list(data_processor_c))
2683 },
2684 plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){
2685 plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor = list(data_processor_a),
2686 p_show_norm = FALSE,
2687 p_include_text = TRUE,
2688 p_include_main = p_include_main,
2689 p_display_n_psms = FALSE)
2690 plot_object$plot_image_in_small_window(p_scale=p_scale)
2691 },
2692 plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){
2693 plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a),
2694 p_include_text=TRUE,
2695 p_include_main=p_include_main,
2696 p_ylim = c(-4,4))
2697 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2698
2699 },
2700 plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){
2701 plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a),
2702 p_include_text=TRUE,
2703 p_include_main=p_include_main)
2704 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2705 },
2706 plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){
2707 plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a),
2708 p_include_text=TRUE,
2709 p_include_main=p_include_main)
2710 plot_object$plot_image_in_small_window(p_scale=p_scale)
2711 },
2712 plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){
2713 plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a),
2714 p_include_text=TRUE,
2715 p_include_main=p_include_main)
2716 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2717 },
2718 plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){
2719 plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c),
2720 p_include_text=TRUE,
2721 p_include_main=p_include_main)
2722 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2723 },
2724 plot_figure_A = function(p_scale=NULL, p_include_main=NULL){
2725 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a),
2726 p_include_text=TRUE,
2727 p_include_main=p_include_main,
2728 p_ylim_time_invariance=c(-4,4) )
2729 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2730 },
2731 plot_figure_B = function(p_scale=NULL, p_include_main=NULL){
2732 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b),
2733 p_include_text=TRUE,
2734 p_include_main=p_include_main,
2735 p_ylim_time_invariance=c(-4,4) )
2736 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2737 },
2738 plot_figure_C = function(p_scale=NULL, p_include_main=NULL){
2739 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c),
2740 p_include_text=TRUE,
2741 p_include_main=p_include_main,
2742 p_ylim_time_invariance=c(-4,4) )
2743 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2744 },
2745 plot_figure_D = function(p_scale=NULL, p_include_main=NULL){
2746 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d),
2747 p_include_text=TRUE,
2748 p_include_main=p_include_main,
2749 p_ylim_time_invariance=c(-4,4) )
2750 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
2751 },
2752 create_stats_for_grouping_figure = function(processors=NULL){
2753 processor <- processors[[1]]
2754 processor$i_fdr$ensure()
2755 aug_i_fdr <- processor$i_fdr$df
2756 aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class)
2757 aug_i_fdr$group_null <- "all"
2758 table(aug_i_fdr$group_training_class)
2759 table(aug_i_fdr$group_good_bad_other)
2760 table(aug_i_fdr$group_null)
2761
2762 create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){
2763 formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name))
2764 formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name))
2765 agg_fdr <- aggregate(formula=formula_fdr, data=i_fdr, FUN=mean)
2766 agg_n <- aggregate(formula=formula_fdr, data=i_fdr, FUN=length)
2767 agg_len <- aggregate(formula=formula_len, data=i_fdr, FUN=mean)
2768 agg_fdr <- rename_columns(df = agg_fdr,
2769 names_before = c(grouping_var_name, "i_fdr"),
2770 names_after = c("group" , "fdr"))
2771 agg_n <- rename_columns(df = agg_n,
2772 names_before = c(grouping_var_name, "i_fdr"),
2773 names_after = c("group" , "n"))
2774 agg_len <- rename_columns(df = agg_len,
2775 names_before = c(grouping_var_name),
2776 names_after = c("group" ))
2777 agg <- merge(agg_fdr, agg_n)
2778 agg <- merge(agg , agg_len)
2779
2780 return(agg)
2781 }
2782
2783 agg_detail <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class")
2784 agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other")
2785 agg_all <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null")
2786
2787 agg <- rbind(agg_detail, agg_grouped)
2788 agg <- rbind(agg, agg_all)
2789
2790 agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1)
2791
2792 linear_combo <- function(x=NULL, a0=NULL, a1=NULL){
2793 result <- (a0 * (1-x) + a1 * x)
2794 return(result)
2795 }
2796
2797 agg$r <- linear_combo(agg$fdr, a0=197, a1= 47)
2798 agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85)
2799 agg$b <- linear_combo(agg$fdr, a0= 17, a1=151)
2800
2801 return(agg)
2802 },
2803 report_ranges_of_comparisons = function(processors=NULL){
2804 report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){
2805 report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){
2806 print(group_name)
2807 print(sprintf(" Number of PSMs: %d", nrow(data_subset)))
2808 mean_confidence <- mean(data_subset$PMD_FDR_input_score)
2809 print(sprintf(" Mean Confidence Score: %3.1f", mean_confidence))
2810 print(sprintf(" PeptideShaker g-FDR: %3.1f", 100-mean_confidence))
2811 mean_PMD_FDR = mean(data_subset$i_fdr)
2812 print(sprintf(" PMD g-FDR: %3.1f", 100*mean_PMD_FDR))
2813 #col <- col2hex("black", 0.2)
2814 #plot(data_subset$i_fdr, pch=".", cex=2, col=col)
2815 #abline(h=0)
2816 }
2817
2818 if (is.null(max_conf)) {
2819 data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf)
2820 group_name <- sprintf("Group %d", min_conf)
2821 } else if (include_max){
2822 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf))
2823 group_name <- sprintf("Group %d through %d", min_conf, max_conf)
2824 } else {
2825 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf))
2826 group_name <- sprintf("Group %d to %d", min_conf, max_conf)
2827 }
2828
2829 report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name)
2830 }
2831
2832 processor <- processors[[1]]
2833 processor$i_fdr$ensure()
2834 i_fdr <- processor$i_fdr$df
2835 info <- processor$info
2836 print(sprintf("PMD and Confidence comparison for -- %s", info$collection_name()))
2837 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE)
2838 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE)
2839 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE)
2840 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 0, max_conf=100 , include_max=TRUE)
2841 }
2842 )
2843
2844 ###############################################################################
2845 # C - 021 - PMD-FDR Wrapper - functions.R #
2846 # #
2847 # Creates the necessary structure to convert the PMD-FDR code into one that #
2848 # can run as a batch file #
2849 # #
2850 ###############################################################################
2851 ###############################################################################
2852 # Class: ModuleArgParser_PMD_FDR
2853 ###############################################################################
2854 ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR",
2855 contains = c("ArgParser"),
2856 fields =list(args = "character") )
2857 ModuleArgParser_PMD_FDR$methods(
2858 initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){
2859 callSuper(description=description, ...)
2860 local_add_argument("--psm_report" , help="full name and path to the PSM report")
2861 local_add_argument("--psm_report_1_percent", default = "" , help="full name and path to the PSM report for 1% FDR")
2862 local_add_argument("--output_i_fdr" , default = "" , help="full name and path to the i-FDR output file ")
2863 local_add_argument("--output_g_fdr" , default = "" , help="full name and path to the g-FDR output file ")
2864 local_add_argument("--output_densities" , default = "" , help="full name and path to the densities output file ")
2865 local_add_argument("--score_field_name" , default = "" , help="name of score field (in R format)")
2866 local_add_argument("--input_file_type" , default = "PMD_FDR_input_file", help="type of input file (currently supports: PSM_Report)")
2867 }
2868 )
2869 ###############################################################################
2870 # Class: Data_Object_Parser
2871 ###############################################################################
2872 Data_Object_Parser <- setRefClass("Data_Object_Parser",
2873 contains = c("Data_Object"),
2874 fields =list(parser = "ModuleArgParser_PMD_FDR",
2875 args = "character",
2876 parsing_results = "list") )
2877 Data_Object_Parser$methods(
2878 initialize = function(){
2879 callSuper()
2880 class_name <<- "Data_Object_Parser"
2881 },
2882 verify = function(){
2883 # Nothing to do here - parser handles verification during load
2884 },
2885 m_load_data = function(){
2886 if (length(args) == 0){
2887 parsing_results <<- parser$parse_arguments(NULL)
2888 } else {
2889 parsing_results <<- parser$parse_arguments(args)
2890 }
2891
2892 },
2893 set_args = function(p_args=NULL){
2894 # This is primarily used for testing. In operation arguments will be passed automatically (through use of commandArgs)
2895 args <<- p_args
2896 set_dirty(TRUE)
2897 }
2898 )
2899 ###############################################################################
2900 # Class: Data_Object_Info_Parser
2901 ###############################################################################
2902 Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser",
2903 contains = c("Data_Object_Info"),
2904 fields =list(
2905 output_i_fdr = "character",
2906 output_g_fdr = "character",
2907 output_densities = "character"
2908 ) )
2909 Data_Object_Info_Parser$methods(
2910 initialize = function(){
2911 callSuper()
2912 class_name <<- "Data_Object_Info_Parser"
2913 },
2914 verify = function(){
2915 check_field_exists = function(field_name=NULL, check_empty = TRUE){
2916 field_value <- get_parser()$parsing_results[field_name]
2917 checkTrue(! is.null(field_value),
2918 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value))
2919 if (check_empty){
2920 checkTrue(! is.null(field_value),
2921 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value))
2922 }
2923 }
2924 # Check parameters passed in
2925 check_field_exists("junk")
2926 check_field_exists("psm_report")
2927 check_field_exists("psm_report_1_percent", check_empty = FALSE)
2928 check_field_exists("output_i_fdr" , check_empty = FALSE)
2929 check_field_exists("output_g_fdr" , check_empty = FALSE)
2930 check_field_exists("output_densities" , check_empty = FALSE)
2931 check_field_exists("score_field_name")
2932 check_field_exists("input_file_type")
2933 },
2934 m_load_data = function(){
2935 parsing_results <- get_parser()$parsing_results
2936
2937 data_file_name <<- as.character(parsing_results["psm_report"])
2938 data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"])
2939 data_path_name <<- as.character(parsing_results[""])
2940 #experiment_name <<- data_file_name
2941 #designation <<- ""
2942 output_i_fdr <<- as.character(parsing_results["output_i_fdr"])
2943 output_g_fdr <<- as.character(parsing_results["output_g_fdr"])
2944 output_densities <<- as.character(parsing_results["output_densities"])
2945
2946 score_field_name <<- as.character(parsing_results["score_field_name"])
2947 set_input_file_type( as.character(parsing_results["input_file_type"]))
2948 },
2949 set_parser = function(parser){
2950 parents[["parser"]] <<- parser
2951 },
2952 get_parser = function(){
2953 return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents"))
2954 },
2955 file_path = function(){
2956 result <- data_file_name # Now assumes that full path is provided
2957 if (length(result) == 0){
2958 stop("Unable to validate file path - file name is missing")
2959 }
2960 return(result)
2961 },
2962 file_path_1_percent_FDR = function(){
2963 local_file_name <- get_data_file_name_1_percent_FDR()
2964 if (length(local_file_name) == 0){
2965 result <- ""
2966 } else {
2967 result <- local_file_name # path name is no longer relevant
2968 }
2969
2970 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream
2971
2972 # if (length(result) == 0){
2973 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing")
2974 # }
2975 return(result)
2976 },
2977 get_data_file_name_1_percent_FDR = function(){
2978 return(data_file_name_1_percent_FDR)
2979 },
2980 collection_name = function(){
2981 result <- ""
2982 return(result)
2983 },
2984 get_field_name_of_score = function(){
2985 if (length(score_field_name) == 0){
2986 stop("score_field_name has not been set for this project")
2987 }
2988 return(score_field_name)
2989 },
2990 set_input_file_type = function(p_input_file_type=NULL){
2991 if (p_input_file_type == "PSM_Report"){
2992 # do nothing, for now
2993 }
2994 else if (p_input_file_type == "PMD_FDR_input_file"){
2995 score_field_name <<- "PMD_FDR_input_score"
2996 }
2997 else {
2998 stop(sprintf("input_file_type ('%s') is not currently supported - file_type not changed", p_input_file_type))
2999 }
3000 m_input_file_type <<- p_input_file_type
3001 },
3002 get_input_file_type = function(){
3003 if (length(m_input_file_type) == 0){
3004 stop("input_file_type has not been set yet (null string)")
3005 }
3006 if (m_input_file_type == ""){
3007 stop("input_file_type has not been set yet")
3008 }
3009
3010 return(m_input_file_type)
3011 }
3012
3013 )
3014 ###############################################################################
3015 # Class: Processor_PMD_FDR_for_Galaxy
3016 # Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface
3017 ###############################################################################
3018 Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy",
3019 fields = list(
3020 parser = "Data_Object_Parser",
3021 info = "Data_Object_Info_Parser",
3022 raw_data = "Data_Object_Raw_Data",
3023 raw_1_percent = "Data_Object_Raw_1_Percent",
3024 data_converter = "Data_Object_Data_Converter",
3025 data_groups = "Data_Object_Groupings",
3026 densities = "Data_Object_Densities",
3027 alpha = "Data_Object_Alpha",
3028 i_fdr = "Data_Object_Individual_FDR"
3029 ))
3030 Processor_PMD_FDR_for_Galaxy$methods(
3031 initialize = function(){
3032 # This initialization defines all of the dependencies between the various components
3033 # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info")
3034
3035 # info
3036 info$set_parser(parser)
3037 parser$append_child(info)
3038
3039 # raw_data
3040 raw_data$set_info(info)
3041 info$append_child(raw_data)
3042
3043 # raw_1_percent
3044 raw_1_percent$set_info(info)
3045 info$append_child(raw_1_percent)
3046
3047 # data_converter
3048 data_converter$set_info (info)
3049 data_converter$set_raw_data(raw_data)
3050 info $append_child (data_converter)
3051 raw_data $append_child (data_converter)
3052
3053 # data_groups
3054 data_groups$set_info (info)
3055 data_groups$set_data_converter(data_converter)
3056 data_groups$set_raw_1_percent (raw_1_percent)
3057 info $append_child (data_groups)
3058 data_converter$append_child (data_groups)
3059 raw_1_percent $append_child (data_groups)
3060
3061 # densities
3062 densities $set_data_groups(data_groups)
3063 data_groups$append_child (densities)
3064
3065 # alpha
3066 alpha $set_densities(densities)
3067 densities$append_child (alpha)
3068
3069 # i_fdr
3070 i_fdr$set_data_groups(data_groups)
3071 i_fdr$set_densities (densities)
3072 i_fdr$set_alpha (alpha)
3073 data_groups $append_child(i_fdr)
3074 densities $append_child(i_fdr)
3075 alpha $append_child(i_fdr)
3076
3077 },
3078 compute = function(){
3079 #i_fdr is currently the lowest level object - it ultimately depends on everything else.
3080 i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure())
3081
3082 save_standard_df(x = densities$df, file_path = info$output_densities)
3083 save_standard_df(x = alpha$df, file_path = info$output_g_fdr)
3084 save_standard_df(x = i_fdr$df, file_path = info$output_i_fdr)
3085 }
3086 )
3087 ###############################################################################
3088 # D - 021 - PMD-FDR Main.R #
3089 # #
3090 # File Description: Contains the base code that interprets the parameters #
3091 # and computes i-FDR and g-FDR for a mass spec project #
3092 # #
3093 ###############################################################################
3094 argv <- commandArgs(TRUE) # Saves the parameters (command code)
3095
3096 processor <- Processor_PMD_FDR_for_Galaxy$new()
3097 processor$parser$set_args(argv)
3098 processor$compute()
3099