Mercurial > repos > galaxyp > pdm_fdr
comparison PMD_FDR_package_for_Galaxy.R @ 0:6dfa920caa52 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pdm_fdr commit 53f3dc4ef4742606e24ab2b0e899b2dc56acac83
author | galaxyp |
---|---|
date | Fri, 27 Sep 2019 14:36:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6dfa920caa52 |
---|---|
1 ############################################################################### | |
2 # PMD_FDR_package_for_Galaxy.R # | |
3 # # | |
4 # Project 021 - PMD-FDR for Galaxy-P # | |
5 # # | |
6 # Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy # | |
7 # Note that plotting code has been left in that is not used # | |
8 # in this file; this is the code I used to create figures for # | |
9 # publication. I left it in for potential development of views. # | |
10 # # | |
11 # This file was created by concatenating the following files: # | |
12 # # | |
13 # A - 005 - Parser - ArgParser.R # | |
14 # B - 019 - PMD-FDR - functions.R # | |
15 # C - 021 - PMD-FDR Wrapper - functions.R # | |
16 # D - 021 - PMD-FDR Main.R # | |
17 # # | |
18 # Required packages: argparser # | |
19 # stringr # | |
20 # # | |
21 # Release date: 2019-09-26 # | |
22 # Version: 1.2 # | |
23 # # | |
24 ############################################################################### | |
25 # Package currently supports the following parameters: | |
26 # | |
27 # --psm_report full name and path to the PSM report | |
28 # --psm_report_1_percent full name and path to the PSM report for 1% FDR | |
29 # --output_i_fdr full name and path to the i-FDR output file | |
30 # --output_g_fdr full name and path to the g-FDR output file | |
31 # --output_densities full name and path to the densities output file | |
32 # -- | |
33 # | |
34 ############################################################################### | |
35 # A - 005 - Parser - ArgParser.R # | |
36 # # | |
37 # Description: Wrapper for argparser package, using RefClass # | |
38 # # | |
39 ############################################################################### | |
40 | |
41 #install.packages("argparser") | |
42 library(argparser) | |
43 | |
44 # Class definition | |
45 | |
46 ArgParser <- setRefClass("ArgParser", | |
47 fields = c("parser")) | |
48 ArgParser$methods( | |
49 initialize = function(...){ | |
50 parser <<- arg_parser(...) | |
51 }, | |
52 local_add_argument = function(...){ | |
53 parser <<- add_argument(parser, ...) | |
54 }, | |
55 parse_arguments = function(...){ | |
56 result = parse_args(parser, ...) | |
57 return(result) | |
58 } | |
59 ) | |
60 | |
61 ############################################################################### | |
62 # B - 019 - PMD-FDR - functions.R # | |
63 # # | |
64 # Primary work-horse for PMD-FDR # | |
65 # # | |
66 ############################################################################### | |
67 ############################################################################### | |
68 ####### Load libraries etc. | |
69 ############################################################################### | |
70 library(stringr) | |
71 library(RUnit) | |
72 | |
73 ############################################################# | |
74 ####### Global values (should be parameters to module) | |
75 ############################################################# | |
76 | |
77 MIN_GOOD_PEPTIDE_LENGTH <- 11 | |
78 | |
79 ############################################################# | |
80 ####### General purpose functions | |
81 ############################################################# | |
82 # Creates a more useful error report when file is not reasonable | |
83 safe_file_exists <- function(file_path){ # Still not particularly useful in cases where it is a valid directory | |
84 tryCatch( | |
85 return(file_test(op = "-f", x=file_path)), | |
86 error=function(e) {simpleError(sprintf("file path is not valid: '%s'", file_path))} | |
87 ) | |
88 } | |
89 # My standard way of loading data into data.frames | |
90 load_standard_df <- function(file_path=NULL){ | |
91 clean_field_names = function(field_names){ | |
92 result <- field_names | |
93 idx_blank <- which(result == "") | |
94 result[idx_blank] <- sprintf("<Field %d>", idx_blank) | |
95 return(result) | |
96 } | |
97 if (safe_file_exists(file_path)){ | |
98 field_names <- read_field_names(file_path, sep = "\t") | |
99 field_names <- clean_field_names(field_names) | |
100 data <- read.table(file = file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, blank.lines.skip = TRUE)#, check.names = FALSE) | |
101 colnames(data) = field_names | |
102 } else { | |
103 stop(sprintf("File path does not exist: '%s'", file_path)) | |
104 } | |
105 return(data) | |
106 } | |
107 save_standard_df <- function(x=NULL, file_path=NULL){ | |
108 if (file_path != ""){ | |
109 write.table(x = x, file = file_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
110 } | |
111 } | |
112 rename_column <- function(df=NULL, name_before=NULL, name_after=NULL, suppressWarnings=FALSE){ | |
113 if (is.null(df)){ | |
114 stop("Dataframe (df) does not exist - unable to rename column") | |
115 } | |
116 if (name_before %in% colnames(df)){ | |
117 df[,name_after] <- df[,name_before] | |
118 df[,name_before] <- NULL | |
119 } else if (!suppressWarnings){ | |
120 warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before)) | |
121 } | |
122 return(df) | |
123 } | |
124 rename_columns <- function(df=NULL, names_before=NULL, names_after=NULL){ | |
125 for (i in safe_iterator(length(names_before))){ | |
126 df <- rename_column(df, names_before[i], names_after[i]) | |
127 } | |
128 return(df) | |
129 } | |
130 round_to_tolerance <- function(x=NULL, tolerance=NULL, ...){ | |
131 return(function_to_tolerance(x=x, tolerance=tolerance, FUN=round, ...)) | |
132 } | |
133 function_to_tolerance <- function(x=NULL, tolerance=NULL, FUN=NULL, ...){ | |
134 return(FUN(x/tolerance, ...) * tolerance) | |
135 } | |
136 safe_median <- function(x) median(x, na.rm=TRUE) | |
137 normalize_density <- function(d){ | |
138 # Normalizes y-values in density function | |
139 # so that the integral under the curve is 1 | |
140 # (uses rectangles to approximate area) | |
141 delta_x <- diff(range(d$x)) / length(d$x) | |
142 unnormalized_integral <- delta_x * sum(d$y) | |
143 new_d <- d | |
144 new_d$y <- with(new_d, y ) | |
145 | |
146 return(new_d) | |
147 } | |
148 if_null <- function(cond=NULL, null_result=NULL, not_null_result=NULL){ | |
149 return(switch(1+is.null(cond), | |
150 not_null_result, | |
151 null_result)) | |
152 } | |
153 rainbow_with_fixed_intensity <- function(n=NULL, goal_intensity_0_1=NULL, alpha=NULL){ | |
154 goal_intensity <- 255*goal_intensity_0_1 | |
155 hex_colors <- rainbow(n) | |
156 rgb_colors <- col2rgb(hex_colors) | |
157 df_colors <- data.frame(t(rgb_colors)) | |
158 df_colors$intensity <- with(df_colors, 0.2989*red + 0.5870*green + 0.1140*blue) | |
159 | |
160 df_colors$white_black <- with(df_colors, ifelse(intensity < goal_intensity, 255, 0)) | |
161 df_colors$mix_level <- with(df_colors, (white_black - goal_intensity) / (white_black - intensity ) ) | |
162 df_colors$new_red <- with(df_colors, mix_level*red + (1-mix_level)*white_black) | |
163 df_colors$new_green <- with(df_colors, mix_level*green + (1-mix_level)*white_black) | |
164 df_colors$new_blue <- with(df_colors, mix_level*blue + (1-mix_level)*white_black) | |
165 names_pref_new <- c("new_red", "new_green", "new_blue") | |
166 names_no_pref <- c("red", "green", "blue") | |
167 df_colors <- df_colors[,names_pref_new] | |
168 df_colors <- rename_columns(df_colors, names_before = names_pref_new, names_after = names_no_pref) | |
169 rgb_colors <-as.matrix(df_colors/255 ) | |
170 | |
171 return(rgb(rgb_colors, alpha=alpha)) | |
172 } | |
173 safe_iterator <- function(n_steps = NULL){ | |
174 if (n_steps < 1){ | |
175 result = numeric(0) | |
176 } else { | |
177 result = 1:n_steps | |
178 } | |
179 return(result) | |
180 } | |
181 col2hex <- function(cols=NULL, col_alpha=255){ | |
182 if (all(col_alpha<=1)){ | |
183 col_alpha <- round(col_alpha*255) | |
184 } | |
185 col_matrix <- t(col2rgb(cols)) | |
186 results <- rgb(col_matrix, alpha=col_alpha, maxColorValue = 255) | |
187 return(results) | |
188 } | |
189 credible_interval <- function(x=NULL, N=NULL, precision=0.001, alpha=0.05){ | |
190 # Approximates "highest posterior density interval" | |
191 # Uses exact binomial but with a finite list of potential values (1/precision) | |
192 | |
193 p <- seq(from=0, to=1, by=precision) | |
194 d <- dbinom(x = x, size = N, prob = p) | |
195 d <- d / sum(d) | |
196 df <- data.frame(p=p, d=d) | |
197 df <- df[order(-df$d),] | |
198 df$cumsum <- cumsum(df$d) | |
199 max_idx <- sum(df$cumsum < (1-alpha)) + 1 | |
200 max_idx <- min(max_idx, nrow(df)) | |
201 | |
202 lower <- min(df$p[1:max_idx]) | |
203 upper <- max(df$p[1:max_idx]) | |
204 | |
205 return(c(lower,upper)) | |
206 } | |
207 verified_element_of_list <- function(parent_list=NULL, element_name=NULL, object_name=NULL){ | |
208 if (is.null(parent_list[[element_name]])){ | |
209 if (is.null(object_name)){ | |
210 object_name = "the list" | |
211 } | |
212 stop(sprintf("Element '%s' does not yet exist in %s", element_name, object_name)) | |
213 } | |
214 return(parent_list[[element_name]]) | |
215 } | |
216 read_field_names = function(file_path=NULL, sep = "\t"){ | |
217 con = file(file_path,"r") | |
218 fields = readLines(con, n=1) | |
219 close(con) | |
220 fields = strsplit(x = fields, split = sep)[[1]] | |
221 return(fields) | |
222 } | |
223 check_field_name = function(input_df = NULL, name_of_input_df=NULL, field_name=NULL){ | |
224 test_succeeded <- field_name %in% colnames(input_df) | |
225 current_columns <- paste0(colnames(input_df), collapse=", ") | |
226 checkTrue(test_succeeded, | |
227 msg = sprintf("Expected fieldname '%s' in %s (but did not find it among %s)", | |
228 field_name, name_of_input_df, current_columns)) | |
229 } | |
230 | |
231 ############################################################# | |
232 ####### Classes for Data | |
233 ############################################################# | |
234 | |
235 ############################################################################### | |
236 # Class: Data_Object | |
237 ############################################################################### | |
238 Data_Object <- setRefClass("Data_Object", | |
239 fields =list(m_is_dirty = "logical", | |
240 parents = "list", | |
241 children = "list", | |
242 class_name = "character")) | |
243 Data_Object$methods( | |
244 initialize = function(){ | |
245 m_is_dirty <<- TRUE | |
246 class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>" | |
247 }, | |
248 load_data = function(){ | |
249 #print(sprintf("Calling %s$load_data()", class_name)) # Useful for debugging | |
250 ensure_parents() | |
251 verify() | |
252 m_load_data() | |
253 set_dirty(new_value = FALSE) | |
254 }, | |
255 ensure = function(){ | |
256 if (m_is_dirty){ | |
257 load_data() | |
258 } | |
259 }, | |
260 set_dirty = function(new_value){ | |
261 if (new_value != m_is_dirty){ | |
262 m_is_dirty <<- new_value | |
263 set_children_dirty() | |
264 } | |
265 }, | |
266 verify = function(){ | |
267 stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name)) | |
268 }, | |
269 m_load_data = function(){ | |
270 stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name)) | |
271 }, | |
272 append_parent = function(parent=NULL){ | |
273 parents <<- append(parents, parent) | |
274 }, | |
275 append_child = function(child=NULL){ | |
276 children <<- append(children, child) | |
277 }, | |
278 ensure_parents = function(){ | |
279 for (parent in parents){ | |
280 # print(sprintf("Calling %s$ensure()", parent$class_name)) # Useful for debugging | |
281 parent$ensure() | |
282 } | |
283 }, | |
284 set_children_dirty = function(){ | |
285 for (child in children){ | |
286 child$set_dirty(TRUE) | |
287 } | |
288 } | |
289 ) | |
290 ############################################################################### | |
291 # Class: Data_Object_Info | |
292 ############################################################################### | |
293 Data_Object_Info <- setRefClass("Data_Object_Info", | |
294 contains = "Data_Object", | |
295 fields =list( | |
296 data_file_name_1_percent_FDR = "character", | |
297 data_file_name = "character", | |
298 data_path_name = "character", | |
299 experiment_name = "character", | |
300 designation = "character", | |
301 | |
302 m_input_file_type = "character", | |
303 | |
304 score_field_name = "character" | |
305 #collection_name="character", | |
306 #dir_results="character", | |
307 #dir_dataset="character", | |
308 #dataset_designation="character", | |
309 #file_name_dataset="character", | |
310 #file_name_dataset_1_percent="character", | |
311 #experiment_name="character" | |
312 ) ) | |
313 Data_Object_Info$methods( | |
314 initialize = function(){ | |
315 callSuper() | |
316 class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>" | |
317 }, | |
318 verify = function(){ | |
319 checkFieldExists = function(field_name=NULL){ | |
320 field_value <- .self[[field_name]] | |
321 checkTrue(length(field_value) > 0, | |
322 sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name)) | |
323 checkTrue(length(field_value) == 1, | |
324 sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name)) | |
325 checkTrue(field_value != "", | |
326 sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name)) | |
327 } | |
328 checkFieldExists("data_file_name") | |
329 checkFieldExists("data_path_name") | |
330 checkFieldExists("experiment_name") | |
331 checkFieldExists("designation") | |
332 checkFieldExists("m_input_file_type") | |
333 checkFieldExists("score_field_name") | |
334 }, | |
335 m_load_data = function(){ | |
336 # Nothing to do - this is really a data class | |
337 }, | |
338 file_path = function(){ | |
339 result <- file.path(data_path_name, data_file_name) | |
340 if (length(result) == 0){ | |
341 stop("Unable to validate file path - one or both of path name and file name are missing") | |
342 } | |
343 return(result) | |
344 }, | |
345 file_path_1_percent_FDR = function(){ | |
346 local_file_name <- get_data_file_name_1_percent_FDR() | |
347 if (length(local_file_name) == 0){ | |
348 result <- "" | |
349 } else { | |
350 result <- file.path(data_path_name, local_file_name) | |
351 } | |
352 | |
353 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream | |
354 | |
355 # if (length(result) == 0){ | |
356 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") | |
357 # } | |
358 return(result) | |
359 }, | |
360 get_data_file_name_1_percent_FDR = function(){ | |
361 return(data_file_name_1_percent_FDR) | |
362 }, | |
363 collection_name = function(){ | |
364 result <- sprintf("%s_%s", experiment_name, designation) | |
365 return(result) | |
366 }, | |
367 get_field_name_of_score = function(){ | |
368 if (length(score_field_name) == 0){ | |
369 stop("score_field_name has not been set for this project") | |
370 } | |
371 return(score_field_name) | |
372 }, | |
373 set_input_file_type = function(p_input_file_type=NULL){ | |
374 if (p_input_file_type == "PSM_Report"){ | |
375 # do nothing, for now | |
376 } | |
377 else if (p_input_file_type == "PMD_FDR_input_file"){ | |
378 score_field_name <<- "PMD_FDR_input_score" | |
379 } | |
380 else { | |
381 stop(sprintf("input_file_type ('%s') is not currently supported - file_type not changed", p_input_file_type)) | |
382 } | |
383 m_input_file_type <<- p_input_file_type | |
384 }, | |
385 get_input_file_type = function(){ | |
386 if (length(m_input_file_type) == 0){ | |
387 stop("input_file_type has not been set yet (null string)") | |
388 } | |
389 if (m_input_file_type == ""){ | |
390 stop("input_file_type has not been set yet") | |
391 } | |
392 | |
393 return(m_input_file_type) | |
394 } | |
395 ) | |
396 ############################################################################### | |
397 # Class: Data_Object_Info_737_two_step | |
398 ############################################################################### | |
399 Data_Object_Info_737_two_step <- setRefClass("Data_Object_Info_737_two_step", | |
400 contains = "Data_Object_Info", | |
401 fields =list()) | |
402 Data_Object_Info_737_two_step$methods( | |
403 initialize = function(){ | |
404 callSuper() | |
405 class_name <<- "Data_Object_Info_737_two_step" | |
406 score_field_name <<- "Confidence [%]" | |
407 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
408 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular" | |
409 data_path_name <<- file.path(".", "Data") | |
410 experiment_name <<- "Oral_737_NS" | |
411 designation <<- "two_step" | |
412 | |
413 set_input_file_type("PSM_Report") | |
414 | |
415 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
416 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
417 | |
418 } | |
419 ) | |
420 | |
421 ############################################################################### | |
422 # Class: Data_Object_Info_737_combined | |
423 ############################################################################### | |
424 Data_Object_Info_737_combined <- setRefClass("Data_Object_Info_737_combined", | |
425 contains = "Data_Object_Info", | |
426 fields =list()) | |
427 Data_Object_Info_737_combined$methods( | |
428 initialize = function(){ | |
429 callSuper() | |
430 class_name <<- "Data_Object_Info_737_combined" | |
431 score_field_name <<- "Confidence [%]" | |
432 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
433 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular" | |
434 data_path_name <<- file.path(".", "Data") | |
435 experiment_name <<- "Oral_737_NS" | |
436 designation <<- "two_step" | |
437 | |
438 set_input_file_type("PSM_Report") | |
439 | |
440 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
441 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
442 | |
443 } | |
444 ) | |
445 | |
446 ############################################################################### | |
447 # Class: Data_Object_Pyrococcus_tr | |
448 ############################################################################### | |
449 Data_Object_Pyrococcus_tr <- setRefClass("Data_Object_Pyrococcus_tr", | |
450 contains = "Data_Object_Info", | |
451 fields =list()) | |
452 Data_Object_Pyrococcus_tr$methods( | |
453 initialize = function(){ | |
454 callSuper() | |
455 class_name <<- "Data_Object_Pyrococcus_tr" | |
456 score_field_name <<- "Confidence [%]" | |
457 data_file_name_1_percent_FDR <<- "" | |
458 data_file_name <<- "Pfu_traditional_Extended_PSM_Report.tabular" | |
459 data_path_name <<- file.path(".", "Data") | |
460 experiment_name <<- "Pyrococcus" | |
461 designation <<- "tr" | |
462 | |
463 set_input_file_type("PSM_Report") | |
464 | |
465 } | |
466 ) | |
467 ############################################################################### | |
468 # Class: Data_Object_Mouse_Mutations | |
469 ############################################################################### | |
470 Data_Object_Mouse_Mutations <- setRefClass("Data_Object_Mouse_Mutations", | |
471 contains = "Data_Object_Info", | |
472 fields =list()) | |
473 Data_Object_Mouse_Mutations$methods( | |
474 initialize = function(){ | |
475 callSuper() | |
476 class_name <<- "Data_Object_Mouse_Mutations" | |
477 score_field_name <<- "Confidence [%]" | |
478 data_file_name_1_percent_FDR <<- "" | |
479 data_file_name <<- "Combined_DB_Mouse_5PTM.tabular" | |
480 data_path_name <<- file.path(".", "Data") | |
481 experiment_name <<- "Mouse Mutations" | |
482 designation <<- "combined_05" | |
483 | |
484 set_input_file_type("PSM_Report") | |
485 | |
486 } | |
487 ) | |
488 ############################################################################### | |
489 # Class: Data_Object_Raw_Data | |
490 ############################################################################### | |
491 Data_Object_Raw_Data <- setRefClass("Data_Object_Raw_Data", | |
492 contains = "Data_Object", | |
493 fields =list(df = "data.frame")) | |
494 Data_Object_Raw_Data$methods( | |
495 initialize = function(){ | |
496 callSuper() | |
497 class_name <<- "Data_Object_Raw_Data" | |
498 }, | |
499 verify = function(){ | |
500 # Check that file exists before using it | |
501 file_path <- get_info()$file_path() | |
502 if (! safe_file_exists(file_path)){ | |
503 stop(sprintf("Raw data file does not exist (%s)", file_path)) | |
504 } | |
505 }, | |
506 set_info = function(info){ | |
507 parents[["info"]] <<- info | |
508 }, | |
509 get_info = function(){ | |
510 return(verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents")) | |
511 }, | |
512 m_load_data = function(){ | |
513 info <- get_info() | |
514 df <<- load_standard_df(info$file_path()) | |
515 } | |
516 ) | |
517 ############################################################################### | |
518 # Class: Data_Object_Raw_1_Percent | |
519 ############################################################################### | |
520 Data_Object_Raw_1_Percent <- setRefClass("Data_Object_Raw_1_Percent", | |
521 contains = "Data_Object", | |
522 fields =list(df = "data.frame")) | |
523 Data_Object_Raw_1_Percent$methods( | |
524 initialize = function(){ | |
525 callSuper() | |
526 class_name <<- "Data_Object_Raw_1_Percent" | |
527 }, | |
528 set_info = function(info){ | |
529 parents[["info"]] <<- info | |
530 }, | |
531 verify = function(){ | |
532 # Do nothing - a missing file name is acceptable for this module and is dealt with in load() | |
533 }, | |
534 get_info = function(){ | |
535 return(verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents")) | |
536 }, | |
537 m_load_data = function(){ | |
538 | |
539 info <- get_info() | |
540 file_path <- info$file_path_1_percent_FDR() | |
541 if (exists()){ | |
542 df <<- load_standard_df(info$file_path_1_percent_FDR()) | |
543 } # Note that failing to load is a valid state for this file, leading to not is_dirty. BUGBUG: this could lead to problems if a good file appears later | |
544 }, | |
545 exists = function(){ | |
546 | |
547 info <- get_info() | |
548 local_file_name <- info$get_data_file_name_1_percent_FDR() # Check file name not file path | |
549 | |
550 if (length(local_file_name) == 0 ){ # variable not set | |
551 result = FALSE | |
552 } else if (local_file_name == ""){ # variable set to empty string | |
553 result = FALSE | |
554 } else { | |
555 result = safe_file_exists(info$file_path_1_percent_FDR()) | |
556 } | |
557 | |
558 return(result) | |
559 } | |
560 ) | |
561 ############################################################################### | |
562 # Class: Data_Object_Data_Converter | |
563 ############################################################################### | |
564 Data_Object_Data_Converter <- setRefClass("Data_Object_Data_Converter", | |
565 contains = "Data_Object", | |
566 fields =list(df = "data.frame")) | |
567 Data_Object_Data_Converter$methods( | |
568 initialize = function(){ | |
569 callSuper() | |
570 class_name <<- "Data_Object_Data_Converter" | |
571 }, | |
572 currently_supported_file_types = function(){ | |
573 return(c("PSM_Report", "PMD_FDR_input_file")) | |
574 }, | |
575 verify = function(){ | |
576 checkFileTypeSupported = function(){ | |
577 file_type <- get_info()$get_input_file_type() | |
578 | |
579 supported_list <- paste0(currently_supported_file_types(), collapse = " ") | |
580 checkTrue(file_type %in% currently_supported_file_types(), | |
581 sprintf("File type (%s) is not currently supported in Data_Object_Data_Converter (currently support: %s)", | |
582 file_type, supported_list)) | |
583 } | |
584 info <- get_info() | |
585 raw_data <- get_raw_data() | |
586 | |
587 file_type <- info$get_input_file_type() | |
588 | |
589 checkFileTypeSupported() | |
590 | |
591 data_original <- raw_data$df | |
592 if (file_type == "PSM_Report"){ | |
593 check_field_name(data_original, "raw_data", info$get_field_name_of_score()) | |
594 check_field_name(data_original, "raw_data", "Precursor m/z Error [ppm]") | |
595 check_field_name(data_original, "raw_data", "Spectrum File") | |
596 check_field_name(data_original, "raw_data", "Protein(s)") | |
597 check_field_name(data_original, "raw_data", "Spectrum Title") | |
598 check_field_name(data_original, "raw_data", "Decoy") | |
599 check_field_name(data_original, "raw_data", "Sequence") | |
600 } | |
601 if (file_type == "PMD_FDR_input_file"){ | |
602 check_field_name(data_original, "raw_data", "PMD_FDR_input_score") | |
603 check_field_name(data_original, "raw_data", "PMD_FDR_pmd") | |
604 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file") | |
605 check_field_name(data_original, "raw_data", "PMD_FDR_proteins") | |
606 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title") | |
607 check_field_name(data_original, "raw_data", "PMD_FDR_sequence") | |
608 check_field_name(data_original, "raw_data", "PMD_FDR_decoy") | |
609 } | |
610 | |
611 # if (file_type == "PSM_Report"){ | |
612 # data_new <- raw_data$df | |
613 # field_name_of_score <- info$get_field_name_of_score() | |
614 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ] | |
615 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] | |
616 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] | |
617 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] | |
618 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] | |
619 # } | |
620 | |
621 }, | |
622 m_load_data = function(){ | |
623 | |
624 info <- get_info() | |
625 raw_data <- get_raw_data() | |
626 | |
627 file_type <- info$get_input_file_type() | |
628 | |
629 if (file_type == "PSM_Report"){ | |
630 data_new <- raw_data$df | |
631 field_name_of_score <- info$get_field_name_of_score() | |
632 data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ] | |
633 data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] | |
634 data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] | |
635 data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] | |
636 data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] | |
637 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ] | |
638 data_new$PMD_FDR_decoy <- data_new[, "Decoy" ] | |
639 | |
640 df <<- data_new | |
641 } | |
642 else if (file_type == "PMD_FDR_input_file"){ | |
643 df <<- raw_data$df | |
644 } | |
645 else { | |
646 stop(sprintf("Input file type is not currently supported: '%s'", file_type)) | |
647 } | |
648 | |
649 }, | |
650 set_info = function(info){ | |
651 parents[["info"]] <<- info | |
652 }, | |
653 get_info = function(){ | |
654 return(verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents")) | |
655 }, | |
656 set_raw_data = function(raw_data){ | |
657 parents[["raw_data"]] <<- raw_data | |
658 }, | |
659 get_raw_data = function(){ | |
660 return(verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents")) | |
661 } | |
662 ) | |
663 ############################################################################### | |
664 # Class: Data_Object_Groupings | |
665 ############################################################################### | |
666 Data_Object_Groupings <- setRefClass("Data_Object_Groupings", | |
667 contains = "Data_Object", | |
668 fields =list(df = "data.frame")) | |
669 Data_Object_Groupings$methods( | |
670 initialize = function(){ | |
671 callSuper() | |
672 class_name <<- "Data_Object_Groupings" | |
673 }, | |
674 simplify_field_name = function(x=NULL){ | |
675 result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x) | |
676 return(result) | |
677 }, | |
678 verify = function(){ | |
679 data_original <- get_data_converter()$df | |
680 | |
681 check_field_name(data_original, "data_converter", "PMD_FDR_input_score") | |
682 check_field_name(data_original, "data_converter", "PMD_FDR_pmd") | |
683 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file") | |
684 check_field_name(data_original, "data_converter", "PMD_FDR_proteins") | |
685 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title") | |
686 check_field_name(data_original, "data_converter", "PMD_FDR_sequence") | |
687 check_field_name(data_original, "data_converter", "PMD_FDR_decoy") | |
688 | |
689 }, | |
690 m_load_data = function(){ | |
691 make_data_groups <- function(data_original=NULL){ | |
692 | |
693 # Functions supporting make_data_groups() | |
694 | |
695 standardize_fields <- function(data=NULL){ | |
696 data_new <- data | |
697 | |
698 info <- get_info() | |
699 info$ensure() | |
700 field_name_of_score <- info$get_field_name_of_score() | |
701 | |
702 # #data_new <- rename_column(data_new, "Variable Modifications" , "ptm_list") | |
703 # data_new <- rename_column(data_new, field_name_of_score , "PMD_FDR_input_score") | |
704 # data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd") | |
705 # #data_new <- rename_column(data_new, "Isotope Number" , "isotope_number") | |
706 # #data_new <- rename_column(data_new, "m/z" , "m_z") | |
707 # #data_new <- rename_column(data_new, "Measured Charge" , "charge") | |
708 # data_new <- rename_column(data_new, "Spectrum File" , "PMD_FDR_spectrum_file") | |
709 # data_new <- rename_column(data_new, "Protein(s)" , "PMD_FDR_proteins") | |
710 # data_new <- rename_column(data_new, "Spectrum Title" , "PMD_FDR_spectrum_title") | |
711 # data_new <- manage_decoy_column(data_new) | |
712 | |
713 # Now managed in Data_Converter | |
714 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ] | |
715 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] | |
716 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] | |
717 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] | |
718 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] | |
719 | |
720 data_new$value <- data_new$PMD_FDR_pmd | |
721 data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence) | |
722 #data_new$charge_value <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1))) | |
723 #data_new$measured_mass <- with(data_new, m_z*charge_value) | |
724 data_new$PMD_FDR_spectrum_index <- NA | |
725 data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new) | |
726 | |
727 return(data_new) | |
728 } | |
729 add_grouped_variable <- function(data_groups = data_groups, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){ | |
730 | |
731 # Support functions for add_grouped_variable() | |
732 find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){ | |
733 q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE) | |
734 q <- round_to_tolerance(q, tolerance = tolerance) | |
735 return(q) | |
736 } | |
737 get_group_data_frame <- function(vec=NULL, value_format = NULL){ | |
738 n <- length(vec) | |
739 a <- vec[-n] | |
740 b <- vec[-1] | |
741 | |
742 lower <- ifelse(a == b , "eq", NA) | |
743 lower <- ifelse(is.na(lower ), "ge", lower) | |
744 upper <- ifelse(a == b , "eq", NA) | |
745 upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq") | |
746 upper <- ifelse(is.na(upper ), "lt", upper) | |
747 group <- data.frame(list(idx=1:(n-1), a=a, b=b, lower=lower, upper=upper)) | |
748 | |
749 name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format) | |
750 group$new_var <- with(group, sprintf(name_format, a, b, lower, upper)) | |
751 | |
752 return(group) | |
753 } | |
754 merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){ | |
755 field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group)) | |
756 group_idx <- findInterval(x = data_groups[,field_name_to_group], | |
757 vec = vec, | |
758 all.inside=TRUE) | |
759 data_groups$new_var <- group$new_var[group_idx] | |
760 data_groups <- rename_column(data_groups, "new_var", field_name_new) | |
761 } | |
762 # Body of add_grouped_variable() | |
763 | |
764 vec <- find_interval_vec(x = data_groups[[field_name_to_group]], | |
765 length.out = vec.length.out, | |
766 tolerance = vec.tolerance ) | |
767 group <- get_group_data_frame(vec = vec, | |
768 value_format = value_format) | |
769 df_new <- merge_group_with_data(data_groups = data_groups, | |
770 group = group, | |
771 vec = vec, | |
772 field_name_to_group = field_name_to_group) | |
773 df_new <- add_group_decoy(df_new, field_name_to_group) | |
774 | |
775 return(df_new) | |
776 } | |
777 add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){ | |
778 old_name <- field_name_to_group | |
779 new_name <- sprintf("group_%s", simplify_field_name(old_name)) | |
780 df_new <- data_groups | |
781 df_new[[new_name]] <- data_groups[[old_name]] | |
782 | |
783 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) | |
784 | |
785 return(df_new) | |
786 } | |
787 add_value_norm <- function(data_groups = NULL){ | |
788 | |
789 df_new <- data_groups | |
790 df_new$value_norm <- with(df_new, value - median_of_group_index) | |
791 | |
792 return(df_new) | |
793 } | |
794 add_protein_group <-function(data_groups = NULL){ | |
795 data_new <- data_groups | |
796 df_group_def <- data.frame(stringsAsFactors = FALSE, | |
797 list(pattern = c("" , "pfu_" , "cRAP"), | |
798 group_name = c("human", "pyrococcus", "contaminant"))) | |
799 for (i in 1:nrow(df_group_def)){ | |
800 idx <- grepl(pattern = df_group_def$pattern[i], | |
801 x = data_new$PMD_FDR_proteins) | |
802 data_new$group_proteins[idx] <- df_group_def$group_name[i] | |
803 } | |
804 | |
805 data_new <- add_group_decoy(data_groups = data_new, field_name_to_group = "PMD_FDR_proteins") | |
806 return(data_new) | |
807 } | |
808 add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){ | |
809 simple_field_name <- simplify_field_name(field_name_to_group) | |
810 field_name_decoy <- sprintf("group_decoy_%s", simple_field_name) | |
811 field_name_group <- sprintf("group_%s", simple_field_name) | |
812 | |
813 data_groups[[field_name_decoy]] <- with(data_groups, ifelse(PMD_FDR_decoy, "decoy", data_groups[[field_name_group]])) | |
814 | |
815 return(data_groups) | |
816 } | |
817 add_group_training_class <- function(data_groups = NULL){ | |
818 df_new <- data_groups | |
819 | |
820 lowest_confidence_group <- min(data_groups$group_input_score) | |
821 | |
822 is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) ) | |
823 is_good <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) ) | |
824 is_bad <- with(df_new, (PMD_FDR_decoy == 1) ) | |
825 #is_used_to_train <- with(df_new, used_to_find_middle) # BUGBUG: circular definition | |
826 | |
827 idx_good <- which(is_good ) # & is_long_enough) | |
828 n_good <- length(idx_good) | |
829 idx_testing <- idx_good[c(TRUE,FALSE)] # Selects every other item | |
830 idx_training <- setdiff(idx_good, idx_testing) | |
831 | |
832 #is_good_short <- with(df_new, is_good & !is_long_enough ) | |
833 #is_good_long <- with(df_new, is_good & is_long_enough ) | |
834 is_bad_short <- with(df_new, is_bad & !is_long_enough ) | |
835 is_bad_long <- with(df_new, is_bad & is_long_enough ) | |
836 #is_good_training <- with(df_new, is_good_long & (used_to_find_middle == TRUE ) ) | |
837 #is_good_testing <- with(df_new, is_good_long & (used_to_find_middle == FALSE) ) | |
838 | |
839 df_new$group_training_class <- "other_short" # Default | |
840 df_new$group_training_class[is_long_enough ] <- "other_long" # Default (if long enough) | |
841 df_new$group_training_class[idx_training ] <- "good_training" # Length does not matter (anymore) | |
842 df_new$group_training_class[idx_testing ] <- "good_testing" # Ditto | |
843 #df_new$group_training_class[is_good_short ] <- "good_short" | |
844 df_new$group_training_class[is_bad_long ] <- "bad_long" # ...except for "bad" | |
845 df_new$group_training_class[is_bad_short ] <- "bad_short" | |
846 | |
847 df_new <- add_used_to_find_middle( data_groups = df_new ) # Guarantees consistency between duplicated definitions | |
848 | |
849 return(df_new) | |
850 } | |
851 add_used_to_find_middle <- function(data_groups = NULL){ | |
852 df_new <- data_groups | |
853 idx_used <- which(data_groups$group_training_class == "good_training") | |
854 | |
855 df_new$used_to_find_middle <- FALSE | |
856 df_new$used_to_find_middle[idx_used] <- TRUE | |
857 | |
858 return(df_new) | |
859 } | |
860 add_group_spectrum_index <- function(data_groups = NULL){ | |
861 | |
862 # Supporting functions for add_group_spectrum_index() | |
863 | |
864 get_breaks_all <- function(df_new){ | |
865 # Supporting function(s) for get_breaks_all() | |
866 | |
867 get_cut_points <- function(data_subset){ | |
868 | |
869 # Supporting function(s) for get_cut_points() | |
870 | |
871 cut_values <- function(data=NULL, minimum_segment_length=NULL){ | |
872 # using cpt.mean -- Appears to have a memory leak | |
873 #results_cpt <- cpt.mean(data=data, method="PELT", minimum_segment_length=minimum_segment_length) | |
874 #results <- results_cpt@cpts | |
875 | |
876 # Just look at the end | |
877 #results <- c(length(data)) | |
878 | |
879 # regularly spaced, slightly larger than minimum_segment_length | |
880 n_points <- length(data) | |
881 n_regions <- floor(n_points / minimum_segment_length) | |
882 n_regions <- ifelse(n_regions == 0, 1, n_regions) | |
883 results <- round(seq(1, n_points, length.out = n_regions + 1)) | |
884 results <- results[-1] | |
885 return(results) | |
886 } | |
887 remove_last <- function(x){ | |
888 return(x[-length(x)] ) | |
889 } | |
890 | |
891 # Main code of for get_cut_points() | |
892 max_idx = max(data_subset$PMD_FDR_spectrum_index) | |
893 data_sub_sub <- subset(data_subset, group_training_class == "good_training") #(PMD_FDR_input_score==100) & (PMD_FDR_decoy==0)) | |
894 minimum_segment_length = 50 | |
895 | |
896 values <- data_sub_sub$value | |
897 n_values <- length(values) | |
898 local_to_global_idx <- data_sub_sub$PMD_FDR_spectrum_index | |
899 if (n_values <= minimum_segment_length){ | |
900 result <- c() | |
901 } else { | |
902 local_idx <- cut_values(data=values, minimum_segment_length=minimum_segment_length) | |
903 result <- local_to_global_idx[local_idx] | |
904 result <- remove_last(result) | |
905 } | |
906 result <- c(result, max_idx) | |
907 return(result) | |
908 } | |
909 remove_last <- function(vec) { | |
910 return(vec[-length(vec)]) | |
911 } | |
912 | |
913 # Main code of get_breaks_all() | |
914 | |
915 breaks <- 1 | |
916 | |
917 files <- unique(df_new$PMD_FDR_spectrum_file) | |
918 | |
919 for (local_file in files){ | |
920 data_subset <- subset(df_new, (PMD_FDR_spectrum_file==local_file)) | |
921 if (nrow(data_subset) > 0){ | |
922 breaks <- c(breaks, get_cut_points(data_subset)) | |
923 } | |
924 } | |
925 breaks <- sort(unique(breaks)) | |
926 breaks <- remove_last(breaks) | |
927 breaks <- c(breaks, max(df_new$PMD_FDR_spectrum_index + 1)) | |
928 | |
929 return(breaks) | |
930 } | |
931 | |
932 # Main code of add_group_spectrum_index() | |
933 | |
934 field_name_to_group <- "PMD_FDR_spectrum_index" | |
935 | |
936 df_new <- data_groups[order(data_groups[[field_name_to_group]]),] | |
937 breaks <- get_breaks_all(df_new) | |
938 | |
939 df_new$group_spectrum_index <- cut(x = df_new[[field_name_to_group]], breaks = breaks, right = FALSE, dig.lab = 6) | |
940 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) | |
941 | |
942 return(df_new) | |
943 } | |
944 add_median_of_group_index <-function(data_groups = NULL){ | |
945 field_median <- "median_of_group_index" | |
946 data_good <- subset(data_groups, used_to_find_middle ) | |
947 med <- aggregate(value~group_spectrum_index, data=data_good, FUN=safe_median) | |
948 med <- rename_column(med, "value", field_median) | |
949 | |
950 data_groups[[field_median]] <- NULL | |
951 df_new <- merge(data_groups, med) | |
952 | |
953 return(df_new) | |
954 } | |
955 add_1_percent_to_data_groups <- function(data_groups=NULL){ | |
956 | |
957 data_new <- data_groups | |
958 | |
959 if (get_raw_1_percent()$exists()){ | |
960 # Load 1 percent file | |
961 df_1_percent <- get_raw_1_percent()$df | |
962 | |
963 # Get relevant fields | |
964 df_1_percent$is_in_1percent <- TRUE | |
965 df_1_percent <- rename_column(df_1_percent, "Spectrum Title", "PMD_FDR_spectrum_title") | |
966 df_1_percent <- df_1_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")] | |
967 | |
968 # Merge with data_groups | |
969 data_new <- merge(data_new, df_1_percent, all.x=TRUE) | |
970 data_new$is_in_1percent[is.na(data_new$is_in_1percent)] <- FALSE | |
971 } | |
972 | |
973 # Save results | |
974 return(data_new) | |
975 | |
976 } | |
977 | |
978 | |
979 # Main code of make_data_groups() | |
980 data_groups <- standardize_fields(data_original) | |
981 | |
982 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score", | |
983 data_groups = data_groups, | |
984 vec.length.out = 14, | |
985 vec.tolerance = 1, | |
986 value_format = "03d") | |
987 | |
988 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd", | |
989 data_groups = data_groups, | |
990 vec.length.out = 21, | |
991 vec.tolerance = 0.1, | |
992 value_format = "+05.1f") | |
993 | |
994 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length", | |
995 data_groups = data_groups, | |
996 vec.length.out = 11, | |
997 vec.tolerance = 1, | |
998 value_format = "02d") | |
999 | |
1000 # data_groups <- add_grouped_variable(field_name_to_group = "m_z", | |
1001 # data_groups = data_groups, | |
1002 # vec.length.out = 11, | |
1003 # vec.tolerance = 10, | |
1004 # value_format = "04.0f") | |
1005 # | |
1006 # data_groups <- add_grouped_variable(field_name_to_group = "measured_mass", | |
1007 # data_groups = data_groups, | |
1008 # vec.length.out = 11, | |
1009 # vec.tolerance = 1, | |
1010 # value_format = "04.0f") | |
1011 # | |
1012 # data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number", | |
1013 # data_groups = data_groups ) | |
1014 # | |
1015 # data_groups <- add_already_grouped_variable(field_name_to_group = "charge", | |
1016 # data_groups = data_groups ) | |
1017 # | |
1018 data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file", | |
1019 data_groups = data_groups ) | |
1020 data_groups <- add_protein_group(data_groups = data_groups) | |
1021 data_groups <- add_group_training_class( data_groups = data_groups) | |
1022 data_groups <- add_group_spectrum_index( data_groups = data_groups) | |
1023 data_groups <- add_median_of_group_index( data_groups = data_groups) | |
1024 data_groups <- add_value_norm( data_groups = data_groups) | |
1025 | |
1026 # fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") | |
1027 # fields_of_interest <- c("value", | |
1028 # "PMD_FDR_decoy", | |
1029 # "PMD_FDR_spectrum_title", | |
1030 # "median_of_group_index", | |
1031 # "value_norm", | |
1032 # "used_to_find_middle", | |
1033 # "group_training_class", | |
1034 # fields_of_interest, | |
1035 # sprintf("group_%s" , fields_of_interest), | |
1036 # sprintf("group_decoy_%s", fields_of_interest)) | |
1037 | |
1038 fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") | |
1039 fields_of_interest <- c("value", | |
1040 "PMD_FDR_decoy", | |
1041 "PMD_FDR_spectrum_title", | |
1042 "median_of_group_index", | |
1043 "value_norm", | |
1044 "used_to_find_middle", | |
1045 "group_training_class", | |
1046 fields_of_interest, | |
1047 sprintf("group_%s" , simplify_field_name(fields_of_interest)), | |
1048 sprintf("group_decoy_%s", simplify_field_name(fields_of_interest))) | |
1049 | |
1050 data_groups <- data_groups[,fields_of_interest] | |
1051 data_groups <- add_1_percent_to_data_groups(data_groups) | |
1052 | |
1053 return(data_groups) | |
1054 } | |
1055 | |
1056 data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df | |
1057 df <<- make_data_groups(data_original) | |
1058 }, | |
1059 set_info = function(info){ | |
1060 parents[["info"]] <<- info | |
1061 }, | |
1062 get_info = function(){ | |
1063 return(verified_element_of_list(parents, "info", "Data_Object_Groupings$parents")) | |
1064 }, | |
1065 set_data_converter = function(data_converter){ | |
1066 parents[["data_converter"]] <<- data_converter | |
1067 }, | |
1068 get_data_converter = function(){ | |
1069 return(verified_element_of_list(parents, "data_converter", "Data_Object_Groupings$parents")) | |
1070 }, | |
1071 set_raw_1_percent = function(raw_1_percent){ ############## BUGBUG: the 1% file should be using the same file type format as the standard data (but isn't) | |
1072 parents[["raw_1_percent"]] <<- raw_1_percent | |
1073 }, | |
1074 get_raw_1_percent = function(){ | |
1075 return(verified_element_of_list(parents, "raw_1_percent", "Data_Object_Groupings$parents")) | |
1076 } | |
1077 ) | |
1078 ############################################################################### | |
1079 # Class: Data_Object_Individual_FDR | |
1080 ############################################################################### | |
1081 Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR", | |
1082 contains = "Data_Object", | |
1083 fields =list(df = "data.frame")) | |
1084 Data_Object_Individual_FDR$methods( | |
1085 initialize = function(){ | |
1086 callSuper() | |
1087 class_name <<- "Data_Object_Individual_FDR" | |
1088 }, | |
1089 verify = function(){ | |
1090 data_groups = get_data_groups()$df | |
1091 densities = get_densities()$df | |
1092 alpha = get_alpha()$df | |
1093 | |
1094 check_field_name(data_groups, "data_groups", "value_norm") | |
1095 check_field_name(data_groups, "data_groups", "group_decoy_input_score") | |
1096 check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length") | |
1097 check_field_name(data_groups, "data_groups", "PMD_FDR_input_score") | |
1098 check_field_name(alpha, "alpha", "alpha") # BUGBUG: I'm missing a field here... | |
1099 check_field_name(densities, "densities", "x") | |
1100 check_field_name(densities, "densities", "t") | |
1101 check_field_name(densities, "densities", "f") | |
1102 | |
1103 }, | |
1104 set_data_groups = function(parent){ | |
1105 parents[["data_groups"]] <<- parent | |
1106 }, | |
1107 get_data_groups = function(){ | |
1108 return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents")) | |
1109 }, | |
1110 set_densities = function(parent){ | |
1111 parents[["densities"]] <<- parent | |
1112 }, | |
1113 get_densities = function(){ | |
1114 return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents")) | |
1115 }, | |
1116 set_alpha = function(parent){ | |
1117 parents[["alpha"]] <<- parent | |
1118 }, | |
1119 get_alpha = function(){ | |
1120 return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents")) | |
1121 }, | |
1122 m_load_data = function(){ | |
1123 add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){ | |
1124 # Support functions for add_FDR_to_data_groups() | |
1125 get_group_fdr <- function(group_stats = NULL, data_groups = NULL, densities=NULL){ | |
1126 group_fdr <- apply(X = densities, MARGIN = 2, FUN = max) | |
1127 df_group_fdr <- data.frame(group_fdr) | |
1128 df_group_fdr <- rename_column(df_group_fdr, "group_fdr", "v") | |
1129 df_group_fdr$group_of_interest <- names(group_fdr) | |
1130 t <- df_group_fdr[df_group_fdr$group_of_interest == "t", "v"] | |
1131 f <- df_group_fdr[df_group_fdr$group_of_interest == "f", "v"] | |
1132 df_group_fdr <- subset(df_group_fdr, !(group_of_interest %in% c("x", "t", "f"))) | |
1133 df_group_fdr$group_fdr <-(df_group_fdr$v - t) / (f - t) | |
1134 | |
1135 return(df_group_fdr) | |
1136 } | |
1137 | |
1138 get_mode <- function(x){ | |
1139 d <- density(x) | |
1140 return(d$x[which.max(d$y)]) | |
1141 } | |
1142 | |
1143 # Main code for add_FDR_to_data_groups() | |
1144 | |
1145 # Set up analysis | |
1146 data_new <- data_groups | |
1147 data_new$value_of_interest <- data_new[,field_value] | |
1148 data_new$group_of_interest <- data_new[,field_decoy_group] | |
1149 | |
1150 data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11) | |
1151 | |
1152 # Identify mean PMD_FDR_input_score per group | |
1153 | |
1154 group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean) | |
1155 group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score") | |
1156 | |
1157 #group_fdr <- get_group_fdr(data_groups = data_subset, densities=densities) | |
1158 group_stats <- merge(alpha, group_input_score) | |
1159 group_stats <- subset(group_stats, group_of_interest != "PMD_FDR_decoy") | |
1160 | |
1161 x=c(0,group_stats$group_input_score) | |
1162 y=c(1,group_stats$alpha) | |
1163 FUN_interp <- approxfun(x=x,y=y) | |
1164 | |
1165 data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score) | |
1166 if (set_decoy_to_1){ | |
1167 data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1 | |
1168 } | |
1169 | |
1170 return(data_new) | |
1171 } | |
1172 | |
1173 data_groups = get_data_groups()$df | |
1174 densities = get_densities()$df | |
1175 alpha = get_alpha()$df | |
1176 | |
1177 d_true <- densities[,c("x", "t")] | |
1178 d_false <- densities[,c("x", "f")] | |
1179 | |
1180 i_fdr <- add_FDR_to_data_groups(data_groups = data_groups, | |
1181 densities = densities, | |
1182 alpha = alpha, | |
1183 field_value ="value_norm", | |
1184 field_decoy_group = "group_decoy_input_score") | |
1185 # Derive local t | |
1186 interp_t <- splinefun(x=d_true$x, y=d_true$t) #approxfun(x=d_true$x, y=d_true$y) | |
1187 | |
1188 # Derive local f | |
1189 interp_f <- splinefun(x=d_false$x, y=d_false$f) #approxfun(x=d_true$x, y=d_true$y) | |
1190 | |
1191 # Derive local FDR | |
1192 i_fdr$t <- interp_t(i_fdr$value_of_interest) | |
1193 i_fdr$f <- interp_f(i_fdr$value_of_interest) | |
1194 i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR | |
1195 i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t)) | |
1196 | |
1197 df <<- i_fdr | |
1198 | |
1199 } | |
1200 ) | |
1201 ############################################################################### | |
1202 # Class: Data_Object_Densities | |
1203 ############################################################################### | |
1204 Data_Object_Densities <- setRefClass("Data_Object_Densities", | |
1205 contains = "Data_Object", | |
1206 fields =list(df = "data.frame")) | |
1207 Data_Object_Densities$methods( | |
1208 initialize = function(){ | |
1209 callSuper() | |
1210 class_name <<- "Data_Object_Densities" | |
1211 }, | |
1212 verify = function(){ | |
1213 df_data_groups <- get_data_groups()$df | |
1214 | |
1215 checkTrue(nrow(df_data_groups) > 0, | |
1216 msg = "data_groups data frame was empty (and should not have been)") | |
1217 | |
1218 check_field_name(df_data_groups, "data_groups", "value_norm") | |
1219 check_field_name(df_data_groups, "data_groups", "group_decoy_input_score") | |
1220 check_field_name(df_data_groups, "data_groups", "group_training_class") | |
1221 }, | |
1222 set_data_groups = function(parent=NULL){ | |
1223 parents[["data_groups"]] <<- parent | |
1224 }, | |
1225 get_data_groups = function(){ | |
1226 return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents")) | |
1227 }, | |
1228 m_load_data = function(){ | |
1229 | |
1230 # Support functions for make_densities() | |
1231 set_values_of_interest <- function(df_data_groups){ | |
1232 field_value = "value_norm" | |
1233 field_decoy_group = "group_decoy_input_score" | |
1234 new_data_groups <- get_data_groups()$df | |
1235 new_data_groups$value_of_interest <- new_data_groups[,field_value] | |
1236 new_data_groups$group_of_interest <- new_data_groups[,field_decoy_group] | |
1237 #groups <- sort(unique(new_data_groups$group_of_interest)) | |
1238 | |
1239 return(new_data_groups) | |
1240 } | |
1241 get_ylim <- function(data_groups=NULL){ | |
1242 ylim <- range(data_groups$value_of_interest) | |
1243 | |
1244 return(ylim) | |
1245 } | |
1246 make_hit_density <- function(data_subset=NULL, ylim=NULL){ | |
1247 #stop("Data_Object_Densities$make_hit_density() is untested beyond here") | |
1248 uniformalize_density <- function(d){ | |
1249 # Reorganizes y-values of density function so that | |
1250 # function is monotone increasing to mode | |
1251 # and monotone decreasing afterwards | |
1252 idx_mode <- which.max(d$y) | |
1253 idx_lower <- 1:(idx_mode-1) | |
1254 idx_upper <- idx_mode:length(d$y) | |
1255 | |
1256 values_lower <- d$y[idx_lower] | |
1257 values_upper <- d$y[idx_upper] | |
1258 | |
1259 new_d <- d | |
1260 new_d$y <- c(sort(values_lower, decreasing = FALSE), | |
1261 sort(values_upper, decreasing = TRUE)) | |
1262 | |
1263 return(new_d) | |
1264 } | |
1265 | |
1266 MIN_PEPTIDE_LENGTH = 11 | |
1267 d <- with(subset(data_subset, | |
1268 (PMD_FDR_peptide_length >= MIN_PEPTIDE_LENGTH) & | |
1269 (used_to_find_middle == FALSE)), | |
1270 density(value_of_interest, | |
1271 from = ylim[1], | |
1272 to = ylim[2])) | |
1273 d <- normalize_density( d) | |
1274 d <- uniformalize_density(d) | |
1275 | |
1276 return(d) | |
1277 } | |
1278 make_true_hit_density <- function(data_groups=NULL){ | |
1279 #stop("Data_Object_Densities$make_true_hit_density() is untested beyond here") | |
1280 d_true <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ), | |
1281 ylim = get_ylim(data_groups)) | |
1282 return(d_true) | |
1283 } | |
1284 make_false_hit_density <- function(data_groups=NULL){ | |
1285 d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ), | |
1286 ylim = get_ylim(data_groups)) | |
1287 | |
1288 return(d_false) | |
1289 } | |
1290 add_v_densities <- function(data_groups=NULL, densities=NULL){ | |
1291 #stop("Data_Object_Densities$add_v_densities() is untested beyond here") | |
1292 groups <- sort(unique(data_groups$group_of_interest)) | |
1293 | |
1294 new_densities <- densities | |
1295 | |
1296 for (local_group in groups){ | |
1297 d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)), | |
1298 ylim = get_ylim(data_groups)) | |
1299 new_densities[local_group] <- d_v$y | |
1300 } | |
1301 | |
1302 return(new_densities) | |
1303 } | |
1304 | |
1305 # Main section of make_densities() | |
1306 df_data_groups <- get_data_groups()$df | |
1307 new_data_groups <- set_values_of_interest(df_data_groups) | |
1308 d_true <- make_true_hit_density( new_data_groups) | |
1309 d_false <- make_false_hit_density(new_data_groups) | |
1310 | |
1311 densities <- data.frame(x=d_true$x, | |
1312 t=d_true$y, | |
1313 f=d_false$y) | |
1314 densities <- add_v_densities(data_groups=new_data_groups, densities=densities) | |
1315 df <<- densities | |
1316 } | |
1317 ) | |
1318 ############################################################################### | |
1319 # Class: Data_Object_Alpha | |
1320 ############################################################################### | |
1321 Data_Object_Alpha <- setRefClass("Data_Object_Alpha", | |
1322 contains = "Data_Object", | |
1323 fields =list(df = "data.frame")) | |
1324 Data_Object_Alpha$methods( | |
1325 initialize = function(){ | |
1326 callSuper() | |
1327 class_name <<- "Data_Object_Alpha" | |
1328 }, | |
1329 verify = function(){ | |
1330 densities <- get_densities()$df | |
1331 | |
1332 checkTrue(nrow(densities) > 0, | |
1333 msg = "Densities data.frame was empty (and should not have been)") | |
1334 }, | |
1335 set_densities = function(parent=NULL){ | |
1336 parents[["densities"]] <<- parent | |
1337 }, | |
1338 get_densities = function(){ | |
1339 return(verified_element_of_list(parent_list = parents, element_name = "densities", object_name = "Data_Object_Alpha")) | |
1340 }, | |
1341 m_load_data = function(){ | |
1342 | |
1343 densities <- get_densities()$df | |
1344 | |
1345 max_of_density = apply(X = densities, MARGIN = 2, FUN = max) | |
1346 df_alpha <- data.frame(stringsAsFactors = FALSE, | |
1347 list(v = max_of_density, | |
1348 group_of_interest = names(max_of_density))) | |
1349 df_alpha <- subset(df_alpha, group_of_interest != "x") | |
1350 t <- with(subset(df_alpha, group_of_interest=="t"), v) | |
1351 f <- with(subset(df_alpha, group_of_interest=="f"), v) | |
1352 df_alpha$alpha <- with(df_alpha, (t-v)/(t-f)) | |
1353 | |
1354 alpha <- df_alpha[,c("group_of_interest", "alpha")] | |
1355 alpha <- subset(alpha, (group_of_interest != "t") & (group_of_interest != "f")) | |
1356 | |
1357 df <<- alpha | |
1358 } | |
1359 ) | |
1360 ############################################################################### | |
1361 # Class: Data_Processor | |
1362 ############################################################################### | |
1363 Data_Processor <- setRefClass("Data_Processor", | |
1364 fields =list(info = "Data_Object_Info", | |
1365 raw_data = "Data_Object_Raw_Data", | |
1366 raw_1_percent = "Data_Object_Raw_1_Percent", | |
1367 data_converter = "Data_Object_Data_Converter", | |
1368 data_groups = "Data_Object_Groupings", | |
1369 densities = "Data_Object_Densities", | |
1370 alpha = "Data_Object_Alpha", | |
1371 i_fdr = "Data_Object_Individual_FDR")) | |
1372 Data_Processor$methods( | |
1373 initialize = function(p_info=NULL){ | |
1374 if (! is.null(p_info)){ | |
1375 set_info(p_info) | |
1376 } | |
1377 }, | |
1378 set_info = function(p_info=NULL){ | |
1379 # This initialization defines all of the dependencies between the various components | |
1380 | |
1381 info <<- p_info | |
1382 | |
1383 # raw_data | |
1384 raw_data$set_info(info) | |
1385 info$append_child(raw_data) | |
1386 | |
1387 # raw_1_percent | |
1388 raw_1_percent$set_info(info) | |
1389 info$append_child(raw_1_percent) | |
1390 | |
1391 # data_converter | |
1392 data_converter$set_info (info) | |
1393 data_converter$set_raw_data(raw_data) | |
1394 info $append_child (data_converter) | |
1395 raw_data $append_child (data_converter) | |
1396 | |
1397 # data_groups | |
1398 data_groups$set_info (info) | |
1399 data_groups$set_data_converter(data_converter) | |
1400 data_groups$set_raw_1_percent (raw_1_percent) | |
1401 info $append_child (data_groups) | |
1402 data_converter$append_child (data_groups) | |
1403 raw_1_percent $append_child (data_groups) | |
1404 | |
1405 # densities | |
1406 densities $set_data_groups(data_groups) | |
1407 data_groups$append_child (densities) | |
1408 | |
1409 # alpha | |
1410 alpha $set_densities(densities) | |
1411 densities$append_child (alpha) | |
1412 | |
1413 # i_fdr | |
1414 i_fdr$set_data_groups(data_groups) | |
1415 i_fdr$set_densities (densities) | |
1416 i_fdr$set_alpha (alpha) | |
1417 data_groups $append_child(i_fdr) | |
1418 densities $append_child(i_fdr) | |
1419 alpha $append_child(i_fdr) | |
1420 } | |
1421 ) | |
1422 | |
1423 | |
1424 ############################################################# | |
1425 ####### Classes for Plotting | |
1426 ############################################################# | |
1427 | |
1428 ############################################################################### | |
1429 # Class: Plot_Image | |
1430 ############################################################################### | |
1431 Plot_Image = setRefClass("Plot_Image", | |
1432 fields = list(data_processors = "list", | |
1433 plot_title = "character", | |
1434 include_text = "logical", | |
1435 include_main = "logical", | |
1436 x.intersp = "numeric", | |
1437 y.intersp = "numeric", | |
1438 scale = "numeric", | |
1439 main = "character", | |
1440 is_image_container = "logical")) | |
1441 Plot_Image$methods( | |
1442 initialize = function(p_data_processors = list(), | |
1443 p_include_main = TRUE, | |
1444 p_include_text = TRUE, | |
1445 p_is_image_container = FALSE){ | |
1446 include_main <<- p_include_main | |
1447 include_text <<- p_include_text | |
1448 data_processors <<- p_data_processors | |
1449 is_image_container <<- p_is_image_container | |
1450 }, | |
1451 plot_image = function(){ | |
1452 plot(main="Define plot_image() for subclass") # Abstract function | |
1453 }, | |
1454 get_n = function(){ | |
1455 stop("Need to define function get_n() for subclass") #Abstract function | |
1456 }, | |
1457 create_standard_main = function(){ | |
1458 needs_main <- function(){ | |
1459 return(include_text & include_main & !is_image_container) | |
1460 } | |
1461 if (needs_main()){ | |
1462 collection_name <- data_processors[[1]]$info$collection_name() | |
1463 main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name, format(get_n(), big.mark = ",")) | |
1464 } | |
1465 }, | |
1466 plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){ | |
1467 scale <<- p_scale | |
1468 SIZE_AXIS <- 2.5 * scale # in the units used by mar | |
1469 SIZE_MAIN <- 2.5 * scale | |
1470 SIZE_NO_MARGIN <- 0.1 * scale | |
1471 FONT_SIZE <- 8 * scale | |
1472 WINDOW_WIDTH <- window_width * scale | |
1473 WINDOW_HEIGHT <- window_height * scale | |
1474 X_INTERSP <- 0.5 * scale + 0.4 # manages legend text spacing | |
1475 Y_INTERSP <- 0.5 * scale + 0.4 # manages | |
1476 | |
1477 if (include_main){ | |
1478 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN , SIZE_NO_MARGIN) | |
1479 } else { | |
1480 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN) | |
1481 } | |
1482 mgp = c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines | |
1483 ps = FONT_SIZE | |
1484 x.intersp <<- X_INTERSP | |
1485 y.intersp <<- Y_INTERSP | |
1486 | |
1487 windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT) | |
1488 | |
1489 old_par <- par(mar=mar, ps=ps, mgp=mgp) | |
1490 create_standard_main() | |
1491 | |
1492 plot_image() | |
1493 if (!is_image_container){ | |
1494 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) | |
1495 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) | |
1496 box(lwd=scale) | |
1497 } | |
1498 par(old_par) | |
1499 }, | |
1500 plot_image_in_small_window = function(p_scale=1){ | |
1501 plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25) | |
1502 }, | |
1503 plot_image_in_large_window = function(p_scale=1, window_height=NULL){ | |
1504 plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7) | |
1505 } | |
1506 ) | |
1507 ############################################################################### | |
1508 # Class: Legend_Object | |
1509 ############################################################################### | |
1510 Legend_Object = setRefClass("Legend_Object", | |
1511 contains = "Plot_Image", | |
1512 fields = list(user_params = "list", | |
1513 scale = "numeric")) | |
1514 Legend_Object$methods( | |
1515 initialize = function(p_user_params = NULL, p_scale = NULL){ | |
1516 if (is.null(p_user_params)){ | |
1517 user_params <<- list() | |
1518 } else { | |
1519 user_params <<- p_user_params | |
1520 } | |
1521 if (is.null(p_scale)){ | |
1522 stop("Legend_Object must have a valid scale") | |
1523 } else { | |
1524 scale <<- p_scale | |
1525 } | |
1526 user_params$x <<- if_null(user_params$x , "topleft", user_params$x) | |
1527 user_params$y <<- if_null(user_params$y , NULL, user_params$y) | |
1528 user_params$bty <<- if_null(user_params$bty , "o", user_params$bty) | |
1529 user_params$lwd <<- if_null(user_params$lwd , NULL, user_params$lwd * scale) # Because we allow NULL, scale must be inside parens | |
1530 user_params$seg.len <<- if_null(user_params$seg.len , 3, user_params$seg.len ) * scale | |
1531 user_params$box.lwd <<- if_null(user_params$box.lwd , 1, user_params$box.lwd ) * scale | |
1532 user_params$x.intersp <<- if_null(user_params$x.intersp, 0.6, user_params$x.intersp) * scale | |
1533 user_params$y.intersp <<- if_null(user_params$y.intersp, 0.4, user_params$y.intersp) * scale + 0.2 | |
1534 }, | |
1535 show = function(){ | |
1536 first_legend = legend(x = user_params$x, | |
1537 y = user_params$y, | |
1538 title = "", | |
1539 legend = user_params$leg, | |
1540 col = user_params$col, | |
1541 bty = user_params$bty, | |
1542 lty = user_params$lty, | |
1543 lwd = user_params$lwd, | |
1544 seg.len = user_params$seg.len, | |
1545 box.lwd = user_params$box.lwd, | |
1546 x.intersp = user_params$x.intersp, | |
1547 y.intersp = user_params$y.intersp) | |
1548 new_x = first_legend$rect$left | |
1549 new_y = first_legend$rect$top + first_legend$rect$h * ifelse(scale==1, 0.07, 0.03 - (scale * 0.02)) #switch(scale, 0.01, -0.01, -0.03, -0.05)# (0.07 - 0.09 * ((scale-1)^2))#(0.15 - 0.08*scale)#.07 * (2 - scale) | |
1550 legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n") | |
1551 | |
1552 } | |
1553 ) | |
1554 ############################################################################### | |
1555 # Class: Plot_Multiple_Images | |
1556 ############################################################################### | |
1557 Plot_Multiple_Images = setRefClass("Plot_Multiple_Images", | |
1558 contains = "Plot_Image", | |
1559 fields = list(n_images_wide = "numeric", | |
1560 n_images_tall = "numeric", | |
1561 image_list = "list")) | |
1562 Plot_Multiple_Images$methods( | |
1563 initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){ | |
1564 n_images_wide <<- p_n_images_wide | |
1565 n_images_tall <<- p_n_images_tall | |
1566 image_list <<- p_image_list | |
1567 #plot_title <<- "True Hit and False Hit Distributions" | |
1568 | |
1569 callSuper(p_is_image_container=TRUE, ...) | |
1570 }, | |
1571 plot_image = function(){ | |
1572 # Support functions | |
1573 apply_mtext <- function(letter=NULL){ | |
1574 line=1.3*scale | |
1575 mtext(letter, side=1, line=line, adj=0) | |
1576 } | |
1577 # main code | |
1578 old_par <- par(mfrow=c(n_images_tall, n_images_wide)) | |
1579 i=0 | |
1580 n_images <- length(image_list) | |
1581 | |
1582 for (i in 1:n_images){ | |
1583 image <- image_list[[i]] | |
1584 image$create_standard_main() | |
1585 image$scale <- scale | |
1586 image$plot_image() | |
1587 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) | |
1588 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) | |
1589 box(lwd=scale) | |
1590 apply_mtext(letter=sprintf("(%s)", letters[i])) | |
1591 | |
1592 } | |
1593 par(old_par) | |
1594 | |
1595 } | |
1596 ) | |
1597 ############################################################################### | |
1598 # Class: Plot_Compare_PMD_and_Norm_Density | |
1599 ############################################################################### | |
1600 Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density", | |
1601 contains = "Plot_Image", | |
1602 fields = list(show_norm = "logical", | |
1603 display_n_psms = "logical")) | |
1604 Plot_Compare_PMD_and_Norm_Density$methods( | |
1605 initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){ | |
1606 show_norm <<- p_show_norm | |
1607 display_n_psms <<- p_display_n_psms | |
1608 plot_title <<- "True Hit and False Hit Distributions" | |
1609 | |
1610 callSuper(...) | |
1611 }, | |
1612 plot_image = function(){ | |
1613 # Support functions for plot_compare_PMD_and_norm_density() | |
1614 | |
1615 get_densities <- function(data_subset = NULL, var_value = NULL){ | |
1616 data_subset$value_of_interest <- data_subset[,var_value] | |
1617 from <- min(data_subset$value_of_interest) | |
1618 to <- max(data_subset$value_of_interest) | |
1619 xlim = range(data_subset$value_of_interest) | |
1620 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) | |
1621 data_false <- subset(data_subset, (PMD_FDR_decoy==1)) | |
1622 d_true <- with(data_true , density(value_of_interest, from = from, to = to)) | |
1623 d_false <- with(data_false, density(value_of_interest, from = from, to = to)) | |
1624 d_true <- normalize_density(d_true) | |
1625 d_false <- normalize_density(d_false) | |
1626 | |
1627 densities <- list(d_true=d_true, d_false=d_false, var_value = var_value, n_true = nrow(data_true), n_false = nrow(data_false)) | |
1628 | |
1629 return(densities) | |
1630 } | |
1631 get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ | |
1632 xlim <- range(c( densities_a$d_true$x, densities_a$d_false$y)) | |
1633 if (show_norm){ | |
1634 xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$y)) | |
1635 } | |
1636 return(xlim) | |
1637 } | |
1638 get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ | |
1639 ylim <- range(c( densities_a$d_true$y, densities_a$d_false$y)) | |
1640 if (show_norm){ | |
1641 ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y)) | |
1642 } | |
1643 return(ylim) | |
1644 } | |
1645 plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){ | |
1646 leg = list() | |
1647 leg$leg = c("Good", "Bad") | |
1648 if (display_n_psms){ | |
1649 leg$leg = sprintf("%s (%d PSMs)", | |
1650 leg$leg, | |
1651 c(densities$n_true, densities$n_false)) | |
1652 | |
1653 } | |
1654 leg$col = c("black", "red") | |
1655 leg$lwd = c(3 , 3) | |
1656 leg$lty = c(1 , 2) | |
1657 leg$title = "Hit Category" | |
1658 xlab = ifelse(var_value == "value", | |
1659 "PMD (ppm)", | |
1660 "PMD - normalized (ppm)") | |
1661 ylab = "Density" | |
1662 if (!include_text){ | |
1663 xlab = "" | |
1664 ylab = "" | |
1665 } | |
1666 plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...) | |
1667 lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2]) | |
1668 abline(v=0, h=0, col="gray", lwd=1*scale) | |
1669 if (include_text){ | |
1670 legend_object <- Legend_Object$new(leg, scale) | |
1671 legend_object$show() | |
1672 #legend("topleft", legend=leg.leg, col=leg.col, lwd=leg.lwd, lty=leg.lty, x.intersp = x.intersp, y.intersp = y.intersp) | |
1673 } | |
1674 } | |
1675 | |
1676 # Main code block for plot_compare_PMD_and_norm_density | |
1677 data_processor <- data_processors[[1]] | |
1678 data_processor$data_groups$ensure() | |
1679 data_groups <- data_processor$data_groups$df | |
1680 | |
1681 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) | |
1682 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
1683 | |
1684 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") | |
1685 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") | |
1686 | |
1687 xlim=get_xlim(densities_a, densities_b, show_norm = show_norm) | |
1688 ylim=get_ylim(densities_a, densities_b, show_norm = show_norm) | |
1689 | |
1690 dataset_name <- data_processor$info$collection_name | |
1691 plot_distributions( densities=densities_a, var_value = "value" , dataset_name = dataset_name, xlim=xlim, ylim=ylim) | |
1692 if (show_norm){ | |
1693 plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim) | |
1694 } | |
1695 }, | |
1696 get_n = function(){ | |
1697 data_processor <- data_processors[[1]] | |
1698 data_processor$data_groups$ensure() | |
1699 data_subset_a <- subset(data_processor$data_groups$df , used_to_find_middle == FALSE) | |
1700 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
1701 | |
1702 if (show_norm){ | |
1703 data_subset <- data_subset_a | |
1704 } else { | |
1705 data_subset <- data_subset_b | |
1706 } | |
1707 | |
1708 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) | |
1709 data_false <- subset(data_subset, (PMD_FDR_decoy==1)) | |
1710 | |
1711 return(nrow(data_true) + nrow(data_false)) | |
1712 } | |
1713 ) | |
1714 | |
1715 ############################################################################### | |
1716 # Class: Plot_Time_Invariance_Alt | |
1717 ############################################################################### | |
1718 Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt", | |
1719 contains = "Plot_Image", | |
1720 fields = list(show_norm = "logical", | |
1721 display_n_psms = "logical", | |
1722 training_class = "character", | |
1723 ylim = "numeric", | |
1724 field_of_interest = "character")) | |
1725 Plot_Time_Invariance_Alt$methods( | |
1726 initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){ | |
1727 get_subset_title <- function(training_class=NULL){ | |
1728 if (training_class == "bad_long"){ | |
1729 subset_title="bad only" | |
1730 } else if (training_class == "good_testing"){ | |
1731 subset_title="good-testing only" | |
1732 } else if (training_class == "good_training"){ | |
1733 subset_title="good-training only" | |
1734 } else if (training_class == "other"){ | |
1735 subset_title="other only" | |
1736 } else { | |
1737 stop("Unexpected training_class in plot_time_invariance") | |
1738 } | |
1739 return(subset_title) | |
1740 } | |
1741 | |
1742 ylim <<- p_ylim | |
1743 training_class <<- p_training_class | |
1744 field_of_interest <<- p_field_of_interest | |
1745 subset_title <- get_subset_title(training_class=training_class) | |
1746 backup_title <- sprintf("Middle 25%% PMD for spectra sorted by index%s", | |
1747 ifelse(is.null(subset_title), | |
1748 "", | |
1749 sprintf(" - %s", subset_title))) | |
1750 #plot_title <<- get_main(main_title=main, backup_title=backup_title, data_collection = data_collection) | |
1751 plot_title <<- backup_title | |
1752 | |
1753 callSuper(...) | |
1754 }, | |
1755 plot_image = function(){ | |
1756 # Support functions for plot_time_invariance() | |
1757 | |
1758 # Main code of plot_time_invariance() | |
1759 data_subset = get_data_subset() | |
1760 plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset) | |
1761 abline(h=0, col="blue", lwd=scale) | |
1762 }, | |
1763 get_data_subset = function(){ | |
1764 data_processor <- data_processors[[1]] | |
1765 data_processor$data_groups$ensure() | |
1766 return(subset(data_processor$data_groups$df, (group_training_class==training_class))) | |
1767 }, | |
1768 get_n = function(){ | |
1769 return(nrow(get_data_subset())) | |
1770 }, | |
1771 plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){ | |
1772 n_plot_groups <- 100 | |
1773 | |
1774 field_name_text <- ifelse(field_of_interest=="value", "PMD", "Translated PMD") | |
1775 new_subset <- data_subset | |
1776 new_subset$value_of_interest <- new_subset[,field_of_interest] | |
1777 new_subset <- new_subset[order(new_subset$PMD_FDR_spectrum_index),] | |
1778 | |
1779 idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1) | |
1780 idxs_left <- idxs[-(n_plot_groups+1)] | |
1781 idxs_right <- idxs[-1] - 1 | |
1782 idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1 | |
1783 | |
1784 new_subset$plot_group <- NA | |
1785 for (i in 1:n_plot_groups){ | |
1786 new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i | |
1787 } | |
1788 xleft <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=min) | |
1789 xright <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=max) | |
1790 ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))}) | |
1791 ytop <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))}) | |
1792 boxes <- merge( rename_column(xleft , "PMD_FDR_spectrum_index" , "xleft"), | |
1793 merge( rename_column(xright , "PMD_FDR_spectrum_index" , "xright"), | |
1794 merge(rename_column(ybottom, "value_of_interest", "ybottom"), | |
1795 rename_column(ytop , "value_of_interest", "ytop")))) | |
1796 | |
1797 xlab <- "Spectrum Index" | |
1798 ylab <- sprintf("%s (ppm)", field_name_text ) | |
1799 if (is.null(ylim)){ | |
1800 ylim <<- range(new_subset$value_of_interest) | |
1801 } | |
1802 if (!include_text){ | |
1803 xlab="" | |
1804 ylab="" | |
1805 } | |
1806 plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") | |
1807 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) | |
1808 #points(median_of_group_index~PMD_FDR_spectrum_index, data=data_subset, cex=.5, pch=15) | |
1809 axis(1, labels=include_text, lwd=scale) | |
1810 axis(2, labels=include_text, lwd=scale) | |
1811 box(lwd=scale) #box around plot area | |
1812 } | |
1813 | |
1814 ) | |
1815 ############################################################################### | |
1816 # Class: Plot_Time_Invariance_Alt_Before_and_After | |
1817 ############################################################################### | |
1818 Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After", | |
1819 contains = "Plot_Multiple_Images", | |
1820 fields = list()) | |
1821 Plot_Time_Invariance_Alt_Before_and_After$methods( | |
1822 initialize = function(p_data_processors = NULL, | |
1823 p_include_text=TRUE, | |
1824 p_include_main=FALSE, | |
1825 p_ylim = c(-4,4), ...){ | |
1826 plot_object1 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, | |
1827 p_include_text=p_include_text, | |
1828 p_include_main=p_include_main, | |
1829 p_training_class = "good_testing", | |
1830 p_field_of_interest = "value", | |
1831 p_ylim = p_ylim) | |
1832 | |
1833 plot_object2 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, | |
1834 p_include_text=p_include_text, | |
1835 p_include_main=p_include_main, | |
1836 p_training_class = "good_testing", | |
1837 p_field_of_interest = "value_norm", | |
1838 p_ylim = p_ylim) | |
1839 | |
1840 callSuper(p_n_images_wide=1, | |
1841 p_n_images_tall=2, | |
1842 p_include_text=p_include_text, | |
1843 p_include_main=p_include_main, | |
1844 p_image_list = list(plot_object1, plot_object2), ...) | |
1845 } | |
1846 ) | |
1847 | |
1848 ############################################################################### | |
1849 # Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length | |
1850 ############################################################################### | |
1851 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length", | |
1852 contains = "Plot_Image", | |
1853 fields = list(show_norm = "logical")) | |
1854 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods( | |
1855 initialize = function(p_show_norm=FALSE, ...){ | |
1856 plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length" | |
1857 show_norm <<- p_show_norm | |
1858 callSuper(...) | |
1859 }, | |
1860 get_n = function(){ | |
1861 data_processor <- data_processors[[1]] | |
1862 data_processor$data_groups$ensure() | |
1863 data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1)) | |
1864 return(nrow(data_subset)) | |
1865 }, | |
1866 plot_image = function(){ | |
1867 | |
1868 # Support functions for plot_density_PMD_and_norm_decoy_by_aa_length() | |
1869 | |
1870 add_group_peptide_length_special <- function(){ | |
1871 data_processor <- data_processors[[1]] | |
1872 data_processor$data_groups$ensure() | |
1873 data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object | |
1874 data_groups <- subset(data_groups, used_to_find_middle == FALSE) | |
1875 | |
1876 df_group_definition <- data.frame(stringsAsFactors = FALSE, | |
1877 list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"), | |
1878 min = c( 6 , 9 , 11 , 13 , 16 , 21 ), | |
1879 max = c( 8 , 10 , 12 , 15 , 20 , 50 ) )) | |
1880 group_peptide_length_special <- data.frame(list(PMD_FDR_peptide_length = 6:50)) | |
1881 group_peptide_length_special$min <- with(group_peptide_length_special, sapply(PMD_FDR_peptide_length, FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i]))) | |
1882 group_peptide_length_special <- merge(group_peptide_length_special, df_group_definition) | |
1883 | |
1884 data_groups$group_peptide_length_special <- NULL | |
1885 new_data_groups <- (merge(data_groups, | |
1886 group_peptide_length_special[,c("PMD_FDR_peptide_length", | |
1887 "group_peptide_length_special")])) | |
1888 return(new_data_groups) | |
1889 } | |
1890 get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){ | |
1891 get_density_from_subset <- function(data_subset=NULL, xlim=NULL){ | |
1892 | |
1893 d_group <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2])) | |
1894 d_group <- normalize_density(d_group) | |
1895 | |
1896 return(d_group) | |
1897 } | |
1898 | |
1899 data_temp <- data_subset | |
1900 data_temp$value_of_interest <- data_temp[[field_value]] | |
1901 data_temp$group_of_interest <- data_temp[[field_group]] | |
1902 | |
1903 xlim = range(data_temp$value_of_interest) | |
1904 | |
1905 groups <- sort(unique(data_temp$group_of_interest)) | |
1906 n_groups <- length(groups) | |
1907 | |
1908 d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim ) | |
1909 densities <- list("All decoys" = d_group) | |
1910 for (i in 1:n_groups){ | |
1911 group <- groups[i] | |
1912 | |
1913 d_group <- get_density_from_subset( data_subset=subset(data_temp, (group_of_interest == group)), | |
1914 xlim = xlim ) | |
1915 densities[[group]] <- d_group | |
1916 } | |
1917 | |
1918 return(densities) | |
1919 } | |
1920 get_limits <- function(densities_a = NULL, densities_b = NULL){ | |
1921 xlim = c() | |
1922 ylim = c(0) | |
1923 for (single_density in densities_a){ | |
1924 xlim=range(c(xlim, single_density$x)) | |
1925 ylim=range(c(ylim, single_density$y)) | |
1926 } | |
1927 for (single_density in densities_b){ | |
1928 xlim=range(c(xlim, single_density$x)) | |
1929 ylim=range(c(ylim, single_density$y)) | |
1930 } | |
1931 | |
1932 return(list(xlim=xlim, ylim=ylim)) | |
1933 } | |
1934 plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){ | |
1935 data_groups$group_of_interest <- data_groups[[field_group]] | |
1936 data_groups$value_of_interest <- data_groups[[field_value]] | |
1937 | |
1938 # Main body of plot_decoy_distribution_by_field_of_interest() | |
1939 FIXED_LWD=3 | |
1940 | |
1941 groups <- sort(unique(data_groups$group_of_interest)) | |
1942 n <- length(groups) | |
1943 | |
1944 df_leg <- data.frame(stringsAsFactors = FALSE, | |
1945 list(leg = groups, | |
1946 col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4), | |
1947 lty = rep(1:6, length.out=n), | |
1948 lwd = rep(FIXED_LWD , n)) ) | |
1949 | |
1950 d <- densities[["All decoys"]] | |
1951 | |
1952 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier) | |
1953 ylab = "Density" | |
1954 | |
1955 if (!include_text){ | |
1956 xlab="" | |
1957 ylab="" | |
1958 } | |
1959 plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n") | |
1960 | |
1961 ave_peak <- max(d$y) | |
1962 max_peak <- 0 | |
1963 | |
1964 for (local_group in groups){ | |
1965 data_subset <- subset(data_groups, group_of_interest == local_group) | |
1966 data_info <- subset(df_leg , leg == local_group) | |
1967 col <- data_info$col[1] | |
1968 lty <- data_info$lty[1] | |
1969 lwd <- data_info$lwd[1] | |
1970 if (nrow(data_subset) > 100){ | |
1971 d <- densities[[local_group]] #density(data_subset[[field_value]]) | |
1972 lines(d, col=col, lty=lty, lwd=lwd * scale) | |
1973 peak <- max(d$y) | |
1974 max_peak <- max(max_peak, peak) | |
1975 } | |
1976 } | |
1977 abline(v=0, h=0, lwd=scale) | |
1978 leg <- list(title = "Peptide length (aa)", | |
1979 leg = c("All decoys" , df_leg$leg), | |
1980 col = c(col2hex("black") , df_leg$col), | |
1981 lty = c(1 , df_leg$lty), | |
1982 lwd = c(FIXED_LWD , df_leg$lwd) | |
1983 ) | |
1984 if (include_text){ | |
1985 legend_object = Legend_Object$new(leg, scale) | |
1986 legend_object$show() | |
1987 #first_legend = legend(x="topleft", title = "", legend = leg$leg, col = leg$col, lty = leg$lty, lwd = leg$lwd, seg.len=leg$seg.len, box.lwd=leg$box.lwd, x.intersp = leg$x.intersp, y.intersp = leg$y.intersp) | |
1988 #new_x = first_legend$rect$left | |
1989 #new_y = first_legend$rect$top + first_legend$rect$h * .07 * (2 - scale) | |
1990 #legend(x=new_x, y=new_y, title = leg$title, legend = "", cex=1.15, bty="n") | |
1991 } | |
1992 | |
1993 box(lwd=scale) #box around plot area | |
1994 | |
1995 } | |
1996 | |
1997 # Main body for plot_density_PMD_and_norm_decoy_by_aa_length() | |
1998 | |
1999 data_mod <- add_group_peptide_length_special() | |
2000 data_mod <- subset(data_mod, PMD_FDR_decoy==1) | |
2001 | |
2002 densities_a <- get_densities(data_subset = data_mod, field_value = "value" , field_group = "group_peptide_length_special") | |
2003 densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special") | |
2004 | |
2005 data_processor <- data_processors[[1]] | |
2006 dataset_name <- data_processor$info$collection_name() | |
2007 | |
2008 limits <- get_limits(densities_a, densities_b) | |
2009 xlim <- limits$xlim | |
2010 ylim <- limits$ylim | |
2011 | |
2012 if (show_norm){ | |
2013 plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) | |
2014 } else { | |
2015 plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value" , xlab_modifier = "" , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) | |
2016 } | |
2017 } | |
2018 | |
2019 ) | |
2020 | |
2021 ############################################################################### | |
2022 # Class: Plot_Bad_CI | |
2023 ############################################################################### | |
2024 Plot_Bad_CI = setRefClass("Plot_Bad_CI", | |
2025 contains = "Plot_Image", | |
2026 fields = list(breaks = "numeric", | |
2027 ylim = "numeric")) | |
2028 Plot_Bad_CI$methods( | |
2029 initialize = function(p_breaks=20, p_ylim=NULL, ...){ | |
2030 if (is.null(p_ylim)){ | |
2031 ylim <<- numeric(0) | |
2032 } else { | |
2033 ylim <<- p_ylim | |
2034 } | |
2035 breaks <<- p_breaks | |
2036 plot_title <<- "Credible Intervals for proportion within range - bad" | |
2037 callSuper(...) | |
2038 }, | |
2039 data_processor = function(){ | |
2040 return(data_processors[[1]]) | |
2041 }, | |
2042 get_n = function(){ | |
2043 data_processor()$data_groups$ensure() | |
2044 return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1)))) | |
2045 }, | |
2046 plot_image = function(){ | |
2047 data_processor()$data_groups$ensure() | |
2048 data_groups <- data_processor()$data_groups$df | |
2049 data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long") | |
2050 data_decoy$region <- cut(x = data_decoy$value, breaks = breaks) | |
2051 table(data_decoy$region) | |
2052 regions <- unique(data_decoy$region) | |
2053 | |
2054 N = nrow(data_decoy) | |
2055 find_lower_ci_bound <- function(x){ | |
2056 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) | |
2057 return(ci[1]) | |
2058 } | |
2059 find_upper_ci_bound <- function(x){ | |
2060 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) | |
2061 return(ci[2]) | |
2062 } | |
2063 xleft <- aggregate(value~region, data=data_decoy, FUN=min) | |
2064 xright <- aggregate(value~region, data=data_decoy, FUN=max) | |
2065 ytop <- aggregate(value~region, data=data_decoy, FUN=find_upper_ci_bound) | |
2066 ybottom <- aggregate(value~region, data=data_decoy, FUN=find_lower_ci_bound) | |
2067 | |
2068 xleft <- rename_column(xleft , "value", "xleft" ) | |
2069 xright <- rename_column(xright , "value", "xright" ) | |
2070 ytop <- rename_column(ytop , "value", "ytop" ) | |
2071 ybottom <- rename_column(ybottom, "value", "ybottom") | |
2072 | |
2073 boxes <- merge(merge(xleft, xright), merge(ytop, ybottom)) | |
2074 | |
2075 | |
2076 xlab <- "Precursor Mass Discrepancy (ppm)" | |
2077 ylab <- "Proportion of PSMs\nin subgroup" | |
2078 xlim=range(data_decoy$value) | |
2079 get_ylim(boxes=boxes) | |
2080 if (!include_text){ | |
2081 xlab="" | |
2082 ylab="" | |
2083 } | |
2084 | |
2085 plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") | |
2086 | |
2087 with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale)) | |
2088 | |
2089 abline(h=1/breaks, col="blue", lwd=scale) | |
2090 }, | |
2091 get_ylim = function(boxes=NULL){ | |
2092 is_valid_range <- function(r=NULL){ | |
2093 return(length(r) == 2) | |
2094 } | |
2095 if (! is_valid_range(ylim)){ | |
2096 ylim <<- range(c(0,boxes$ytop, boxes$ybottom)) | |
2097 } | |
2098 } | |
2099 | |
2100 ) | |
2101 ############################################################################### | |
2102 # Class: Plot_Selective_Loss | |
2103 ############################################################################### | |
2104 Plot_Selective_Loss = setRefClass("Plot_Selective_Loss", | |
2105 contains = "Plot_Image", | |
2106 fields = list()) | |
2107 Plot_Selective_Loss$methods( | |
2108 initialize = function( ...){ | |
2109 plot_title <<- "PMD-FDR Selectively removes Bad Hits" | |
2110 callSuper(...) | |
2111 }, | |
2112 data_processor = function(){ | |
2113 return(data_processors[[1]]) | |
2114 }, | |
2115 get_n = function(){ | |
2116 data_processor()$i_fdr$ensure() | |
2117 data_subset <- data_processor()$i_fdr$df | |
2118 return(nrow(data_subset)) | |
2119 }, | |
2120 plot_image = function(){ | |
2121 # Support functions for plot_selective_loss() | |
2122 | |
2123 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ | |
2124 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) | |
2125 tbl <- with(updated_i_fdr, | |
2126 table(PMD_FDR_input_score >= score_threshold, | |
2127 new_confidence < score_threshold, | |
2128 group_decoy_proteins)) | |
2129 df <- data.frame(tbl) | |
2130 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) | |
2131 df_n <- rename_column(df_n, name_before = "Freq", "n") | |
2132 df <- merge(df, df_n) | |
2133 df$rate_of_loss <- with(df, Freq/n) | |
2134 df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) | |
2135 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] | |
2136 if (nrow(df) > 0){ | |
2137 df$score_threshold <- score_threshold | |
2138 } | |
2139 return(df) | |
2140 } | |
2141 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ | |
2142 df=data.frame() | |
2143 for (score_threshold in score_thresholds){ | |
2144 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) | |
2145 df <- rbind(df, df_new_loss) | |
2146 } | |
2147 return(df) | |
2148 } | |
2149 | |
2150 # Main code for plot_selective_loss() | |
2151 | |
2152 updated_i_fdr <- data_processor()$i_fdr$df | |
2153 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) | |
2154 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) | |
2155 xlim <- with(loss_record, range(score_threshold)) | |
2156 ylim <- c(0,1) | |
2157 xlab <- "Fixed Confidence threshold (PeptideShaker score)" | |
2158 ylab <- "Rate of PSM disqualification from PMD-FDR" | |
2159 lwd <- 4 | |
2160 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) | |
2161 | |
2162 groups <- sort(unique(loss_record$group_decoy_proteins)) | |
2163 n_g <- length(groups) | |
2164 | |
2165 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) | |
2166 ltys <- rep(1:6, length.out=n_g) | |
2167 | |
2168 leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category") | |
2169 | |
2170 for (i in 1:n_g){ | |
2171 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) | |
2172 } | |
2173 abline(h=0, v=100, lwd=scale) | |
2174 abline(h=c(0.1, 0.8), col="gray", lwd=scale) | |
2175 | |
2176 #leg = list(leg=group, col=col, lty=lty, lwd=lwd) | |
2177 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) | |
2178 legend_object <- Legend_Object$new(leg, scale) | |
2179 legend_object$show() | |
2180 } | |
2181 | |
2182 ) | |
2183 ############################################################################### | |
2184 # Class: Plot_Selective_Loss_for_TOC | |
2185 ############################################################################### | |
2186 Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC", | |
2187 contains = "Plot_Image", | |
2188 fields = list(xlab="character", | |
2189 ylab="character", | |
2190 title_x="numeric", | |
2191 title_y="numeric", | |
2192 legend_border="logical", | |
2193 legend_x = "numeric", | |
2194 legend_y = "numeric", | |
2195 legend_title="character", | |
2196 legend_location = "character", | |
2197 name_contaminant = "character", | |
2198 name_decoy = "character", | |
2199 name_human = "character", | |
2200 name_pyro = "character")) | |
2201 Plot_Selective_Loss_for_TOC$methods( | |
2202 initialize = function( ...){ | |
2203 plot_title <<- "PMD-FDR selectively removes bad hits" | |
2204 callSuper(...) | |
2205 xlab <<- "Confidence threshold (PeptideShaker)" | |
2206 ylab <<- "PMD Disqualifiction Rate" | |
2207 legend_border <<- FALSE | |
2208 #legend_title <<- "Species/Category" | |
2209 title_x <<- 50 | |
2210 title_y <<- 0.9 | |
2211 legend_x <<- 10 | |
2212 legend_y <<- 0.75 | |
2213 name_contaminant <<- "signal - contaminant" | |
2214 name_decoy <<- "decoy - reversed" | |
2215 name_human <<- "decoy - human" | |
2216 name_pyro <<- "signal - pyrococcus" | |
2217 }, | |
2218 data_processor = function(){ | |
2219 return(data_processors[[1]]) | |
2220 }, | |
2221 get_n = function(){ | |
2222 data_processor()$i_fdr$ensure() | |
2223 data_subset <- data_processor()$i_fdr$df | |
2224 return(nrow(data_subset)) | |
2225 }, | |
2226 plot_image = function(){ | |
2227 # Support functions for plot_selective_loss() | |
2228 | |
2229 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ | |
2230 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) | |
2231 tbl <- with(updated_i_fdr, | |
2232 table(PMD_FDR_input_score >= score_threshold, | |
2233 new_confidence < score_threshold, | |
2234 group_decoy_proteins)) | |
2235 df <- data.frame(tbl) | |
2236 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) | |
2237 df_n <- rename_column(df_n, name_before = "Freq", "n") | |
2238 df <- merge(df, df_n) | |
2239 df$rate_of_loss <- with(df, Freq/n) | |
2240 df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) | |
2241 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] | |
2242 if (nrow(df) > 0){ | |
2243 df$score_threshold <- score_threshold | |
2244 } | |
2245 return(df) | |
2246 } | |
2247 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ | |
2248 df=data.frame() | |
2249 for (score_threshold in score_thresholds){ | |
2250 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) | |
2251 df <- rbind(df, df_new_loss) | |
2252 } | |
2253 return(df) | |
2254 } | |
2255 convert_groups <- function(groups=NULL){ | |
2256 new_groups <- groups | |
2257 new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups) | |
2258 new_groups <- gsub(pattern = "decoy" , replacement = name_decoy , x = new_groups) | |
2259 new_groups <- gsub(pattern = "human" , replacement = name_human , x = new_groups) | |
2260 new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro , x = new_groups) | |
2261 | |
2262 return(new_groups) | |
2263 } | |
2264 | |
2265 # Main code for plot_selective_loss() | |
2266 | |
2267 updated_i_fdr <- data_processor()$i_fdr$df | |
2268 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) | |
2269 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) | |
2270 xlim <- with(loss_record, range(score_threshold)) | |
2271 ylim <- c(0,1) | |
2272 #xlab <- "Fixed Confidence threshold (PeptideShaker score)" | |
2273 #ylab <- "Rate of PSM disqualification from PMD-FDR" | |
2274 lwd <- 4 | |
2275 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) | |
2276 | |
2277 groups <- sort(unique(loss_record$group_decoy_proteins)) | |
2278 n_g <- length(groups) | |
2279 | |
2280 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) | |
2281 ltys <- rep(1:6, length.out=n_g) | |
2282 bty <- ifelse(legend_border, "o", "n") | |
2283 | |
2284 leg <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) | |
2285 #leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) | |
2286 | |
2287 for (i in 1:n_g){ | |
2288 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) | |
2289 } | |
2290 abline(h=0, v=100, lwd=scale) | |
2291 abline(h=c(0.1, 0.8), col="gray", lwd=scale) | |
2292 | |
2293 #leg = list(leg=group, col=col, lty=lty, lwd=lwd) | |
2294 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) | |
2295 legend_object <- Legend_Object$new(leg, scale) | |
2296 legend_object$show() | |
2297 text(x=title_x, y=title_y, labels = plot_title) | |
2298 } | |
2299 | |
2300 ) | |
2301 ############################################################################### | |
2302 # Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR | |
2303 ############################################################################### | |
2304 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR", | |
2305 contains = "Plot_Image", | |
2306 fields = list()) | |
2307 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods( | |
2308 initialize = function( ...){ | |
2309 plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs" | |
2310 callSuper(...) | |
2311 }, | |
2312 data_processor = function(){ | |
2313 return(data_processors[[1]]) | |
2314 }, | |
2315 get_n = function(){ | |
2316 data_processor()$i_fdr$ensure() | |
2317 if (one_percent_calculation_exists()){ | |
2318 i_fdr <- data_processor()$i_fdr$df | |
2319 data_subset <- subset(i_fdr, is_in_1percent==TRUE) | |
2320 n <- nrow(data_subset) | |
2321 } else { | |
2322 n <- 0 | |
2323 } | |
2324 | |
2325 return (n) | |
2326 }, | |
2327 plot_image = function(){ | |
2328 if (one_percent_calculation_exists()){ | |
2329 i_fdr <- get_modified_fdr() | |
2330 report_good_discrepancies(i_fdr) | |
2331 data_TD_good <- get_data_TD_good(i_fdr) | |
2332 mean_results <- get_mean_results(data_TD_good) | |
2333 boxes <- mean_results | |
2334 boxes <- rename_columns(df = boxes, | |
2335 names_before = c("min_conf", "max_conf", "lower" , "upper"), | |
2336 names_after = c("xleft" , "xright" , "ybottom", "ytop" )) | |
2337 xlim <- range(boxes[,c("xleft", "xright")]) | |
2338 ylim <- range(boxes[,c("ybottom", "ytop")]) | |
2339 | |
2340 #head(mean_results) | |
2341 | |
2342 xlab = "Confidence Score (Peptide Shaker)" | |
2343 ylab = "Mean PMD i-FDR" | |
2344 | |
2345 if (!include_text){ | |
2346 xlab="" | |
2347 ylab="" | |
2348 } | |
2349 | |
2350 plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale) | |
2351 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) | |
2352 abline(b=-1, a=100, lwd=4*scale, col="dark gray") | |
2353 abline(h=0, v=100, lwd=1*scale) | |
2354 | |
2355 } else { | |
2356 stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name())) | |
2357 } | |
2358 }, | |
2359 get_mean_results = function(data_TD_good = NULL){ | |
2360 mean_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=mean) | |
2361 mean_i_fdr <- rename_column(mean_i_fdr, "i_fdr", "mean_i_fdr") | |
2362 sd_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=sd) | |
2363 sd_i_fdr <- rename_column(sd_i_fdr, "i_fdr", "sd_i_fdr") | |
2364 n_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=length) | |
2365 n_i_fdr <- rename_column(n_i_fdr, "i_fdr", "n") | |
2366 mean_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=mean) | |
2367 mean_conf <- rename_column(mean_conf, "PMD_FDR_input_score", "mean_conf") | |
2368 min_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=min) | |
2369 min_conf <- rename_column(min_conf, "PMD_FDR_input_score", "min_conf") | |
2370 max_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=max) | |
2371 max_conf <- rename_column(max_conf, "PMD_FDR_input_score", "max_conf") | |
2372 | |
2373 mean_results <- mean_i_fdr | |
2374 mean_results <- merge(mean_results, sd_i_fdr) | |
2375 mean_results <- merge(mean_results, n_i_fdr) | |
2376 mean_results <- merge(mean_results, mean_conf) | |
2377 mean_results <- merge(mean_results, min_conf) | |
2378 mean_results <- merge(mean_results, max_conf) | |
2379 | |
2380 mean_results$se <- with(mean_results, sd_i_fdr / sqrt(n - 1)) | |
2381 mean_results$lower <- with(mean_results, mean_i_fdr - 2*se) | |
2382 mean_results$upper <- with(mean_results, mean_i_fdr + 2*se) | |
2383 return(mean_results) | |
2384 }, | |
2385 get_data_TD_good = function(i_fdr=NULL){ | |
2386 data_TD_good <- subset(i_fdr, TD_good==TRUE) | |
2387 data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),] | |
2388 n <- nrow(data_TD_good) | |
2389 data_TD_good$conf_group <- cut(1:n, breaks=floor(n/100)) | |
2390 data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr | |
2391 return(data_TD_good) | |
2392 }, | |
2393 get_modified_fdr = function(){ | |
2394 i_fdr <- data_processor()$i_fdr$df | |
2395 i_fdr$PMD_good <- i_fdr$i_fdr < 0.01 | |
2396 i_fdr$TD_good <- i_fdr$is_in_1percent == TRUE | |
2397 i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100 | |
2398 return(i_fdr) | |
2399 }, | |
2400 one_percent_calculation_exists = function(){ | |
2401 data_processor()$raw_1_percent$ensure() | |
2402 return(data_processor()$raw_1_percent$exists())# "is_in_1percent" %in% colnames(data_processor()$i_fdr)) | |
2403 }, | |
2404 report_good_discrepancies = function(i_fdr=NULL){ | |
2405 with(subset(i_fdr, (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2406 with(subset(i_fdr, (PMD_FDR_input_score==100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2407 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2408 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2409 with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2410 } | |
2411 | |
2412 ) | |
2413 | |
2414 ############################################################################### | |
2415 # Class: Plot_Density_PMD_by_Score | |
2416 ############################################################################### | |
2417 Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score", | |
2418 contains = "Plot_Image", | |
2419 fields = list(show_norm = "logical")) | |
2420 Plot_Density_PMD_by_Score$methods( | |
2421 initialize = function(p_show_norm=FALSE, ...){ | |
2422 show_norm <<- p_show_norm | |
2423 plot_title <<- "PMD distribution, by Confidence ranges" | |
2424 callSuper(...) | |
2425 | |
2426 }, | |
2427 data_processor = function(){ | |
2428 return(data_processors[[1]]) | |
2429 }, | |
2430 get_n = function(){ | |
2431 return(nrow(data_processor()$data_groups$df)) | |
2432 #data_subset <- data_collection$i_fdr | |
2433 #return(nrow(data_subset)) | |
2434 }, | |
2435 get_modified_data_groups = function(var_value = NULL){ | |
2436 # Note: Filters out used_to_find_middle | |
2437 # Note: Creates "value_of_interest" field | |
2438 # Note: Remakes "group_decoy_input_score" field | |
2439 data_new <- data_processor()$data_groups$df | |
2440 data_new <- subset(data_new, !used_to_find_middle ) | |
2441 data_new$value_of_interest <- data_new[, var_value] | |
2442 | |
2443 cutoff_points <- c(100, 100, 95, 80, 50, 0, 0) | |
2444 n <- length(cutoff_points) | |
2445 uppers <- cutoff_points[-n] | |
2446 lowers <- cutoff_points[-1] | |
2447 | |
2448 for (i in 1:(n-1)){ | |
2449 upper <- uppers[i] | |
2450 lower <- lowers[i] | |
2451 | |
2452 | |
2453 if (lower==upper){ | |
2454 idx <- with(data_new, which( (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0))) | |
2455 cat_name <- sprintf("%d", upper) | |
2456 } else { | |
2457 idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score < upper) & (PMD_FDR_decoy == 0))) | |
2458 cat_name <- sprintf("%02d - %2d", lower, upper) | |
2459 } | |
2460 data_new$group_decoy_input_score[idx] <- cat_name | |
2461 } | |
2462 | |
2463 return(data_new) | |
2464 }, | |
2465 plot_image = function(){ | |
2466 | |
2467 # Support functions for plot_density_PMD_by_score() | |
2468 | |
2469 get_densities <- function(data_subset = NULL, var_value = NULL){ | |
2470 | |
2471 # Support functions for get_densities() | |
2472 | |
2473 # New version | |
2474 | |
2475 # Main body of get_densities() | |
2476 | |
2477 data_subset <- get_modified_data_groups(var_value=var_value) | |
2478 #data_subset$value_of_interest <- data_subset[,var_value] | |
2479 from <- min(data_subset$value_of_interest) | |
2480 to <- max(data_subset$value_of_interest) | |
2481 xlim = range(data_subset$value_of_interest) | |
2482 | |
2483 groups <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE) | |
2484 n_groups <- length(groups) | |
2485 | |
2486 densities <- list(var_value = var_value, groups=groups) | |
2487 | |
2488 for (i in 1:n_groups){ | |
2489 group <- groups[i] | |
2490 | |
2491 data_group_single <- subset(data_subset, (group_decoy_input_score == group)) | |
2492 d_group <- with(data_group_single , density(value_of_interest, from = from, to = to)) | |
2493 d_group <- normalize_density(d_group) | |
2494 | |
2495 densities[[group]] <- d_group | |
2496 } | |
2497 | |
2498 return(densities) | |
2499 | |
2500 } | |
2501 get_xlim <- function(densities_a = NULL, densities_b = NULL){ | |
2502 groups <- densities_a$groups | |
2503 | |
2504 xlim <- 0 | |
2505 for (group in groups){ | |
2506 xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x) | |
2507 } | |
2508 | |
2509 return(xlim) | |
2510 | |
2511 } | |
2512 get_ylim <- function(densities_a = NULL, densities_b = NULL){ | |
2513 groups <- densities_a$groups | |
2514 | |
2515 ylim <- 0 | |
2516 for (group in groups){ | |
2517 ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y) | |
2518 } | |
2519 | |
2520 return(ylim) | |
2521 | |
2522 } | |
2523 plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){ | |
2524 data_groups <- get_modified_data_groups(var_value=var_value) | |
2525 groups <- sort(unique(data_groups$group_decoy_input_score)) | |
2526 n_groups <- length(groups) | |
2527 | |
2528 groups_std <- setdiff(groups, c("100", "decoy", "0") ) | |
2529 groups_std <- sort(groups_std, decreasing = TRUE) | |
2530 groups_std <- c(groups_std, "0") | |
2531 n_std <- length(groups_std) | |
2532 cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5) | |
2533 | |
2534 leg <- list(group = c("100" , groups_std , "decoy" ), | |
2535 leg = c("100" , groups_std , "All Decoys" ), | |
2536 col = c(col2hex("black") , cols , col2hex("purple", col_alpha = 0.5)), | |
2537 lwd = c(4 , rep(2, n_std), 4 ), | |
2538 title = "Confidence Score") | |
2539 | |
2540 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", | |
2541 xlab_modifier) | |
2542 ylab = "Density" | |
2543 if (!include_text){ | |
2544 xlab="" | |
2545 ylab="" | |
2546 } | |
2547 | |
2548 | |
2549 plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...) | |
2550 | |
2551 include_peak_dots = FALSE # BUGBUG: Disabling this for now. Need to move this to class parameter | |
2552 | |
2553 for (i in 1:length(leg$group)){ | |
2554 group <- leg$group[i] | |
2555 d <- densities[[group]] | |
2556 lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale) | |
2557 if (include_peak_dots){ | |
2558 x=d$x[which.max(d$y)] | |
2559 y=max(d$y) | |
2560 points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale) | |
2561 } | |
2562 } | |
2563 | |
2564 abline(v=0, lwd=scale) | |
2565 | |
2566 if (include_text){ | |
2567 legend_object = Legend_Object$new(leg, scale) | |
2568 legend_object$show() | |
2569 } | |
2570 | |
2571 } | |
2572 | |
2573 # Main body for plot_density_PMD_by_score() | |
2574 | |
2575 data_groups <- data_processor()$data_groups$df | |
2576 | |
2577 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) | |
2578 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
2579 | |
2580 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") | |
2581 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") | |
2582 | |
2583 xlim=get_xlim(densities_a, densities_b) | |
2584 ylim=get_ylim(densities_a, densities_b) | |
2585 | |
2586 dataset_name <- data_processor()$info$collection_name() | |
2587 if (show_norm){ | |
2588 plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim) | |
2589 } else { | |
2590 plot_distributions(densities=densities_a, var_value = "value" , xlab_modifier = "" , xlim=xlim, ylim=ylim) | |
2591 } | |
2592 } | |
2593 ) | |
2594 ############################################################################### | |
2595 # Class: Plot_Dataset_Description | |
2596 ############################################################################### | |
2597 Plot_Dataset_Description = setRefClass("Plot_Dataset_Description", | |
2598 contains = "Plot_Multiple_Images", | |
2599 fields = list(ylim_time_invariance = "numeric")) | |
2600 Plot_Dataset_Description$methods( | |
2601 initialize = function(p_data_processors = NULL, | |
2602 p_include_text=TRUE, | |
2603 p_include_main=FALSE, | |
2604 p_ylim_time_invariance = c(-4,4), ...){ | |
2605 plot_object_r1_c1 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, | |
2606 p_include_text=p_include_text, | |
2607 p_include_main=p_include_main, | |
2608 p_training_class = "good_testing", | |
2609 p_field_of_interest = "value", | |
2610 p_ylim = p_ylim_time_invariance) | |
2611 | |
2612 plot_object_r1_c2 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, | |
2613 p_include_text=p_include_text, | |
2614 p_include_main=p_include_main, | |
2615 p_training_class = "good_testing", | |
2616 p_field_of_interest = "value_norm", | |
2617 p_ylim = p_ylim_time_invariance) | |
2618 plot_object_r2_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, | |
2619 p_show_norm=FALSE, | |
2620 p_include_text=p_include_text, | |
2621 p_include_main=p_include_main) | |
2622 | |
2623 plot_object_r2_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, | |
2624 p_show_norm=FALSE, | |
2625 p_include_text=p_include_text, | |
2626 p_include_main=p_include_main) | |
2627 | |
2628 plot_object_r3_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, | |
2629 p_show_norm=TRUE, | |
2630 p_include_text=p_include_text, | |
2631 p_include_main=p_include_main) | |
2632 plot_object_r3_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, | |
2633 p_show_norm=TRUE, | |
2634 p_include_text=p_include_text, | |
2635 p_include_main=p_include_main) | |
2636 callSuper(p_n_images_wide=2, | |
2637 p_n_images_tall=3, | |
2638 p_include_text=p_include_text, | |
2639 p_include_main=p_include_main, | |
2640 p_image_list = list(plot_object_r1_c1, plot_object_r1_c2, | |
2641 plot_object_r2_c1, plot_object_r2_c2, | |
2642 plot_object_r3_c1, plot_object_r3_c2), ...) | |
2643 | |
2644 } | |
2645 ) | |
2646 ############################################################################### | |
2647 # Class: Plots_for_Paper | |
2648 ############################################################################### | |
2649 Plots_for_Paper <- setRefClass("Plots_for_Paper", fields =list(data_processor_a = "Data_Processor", | |
2650 data_processor_b = "Data_Processor", | |
2651 data_processor_c = "Data_Processor", | |
2652 data_processor_d = "Data_Processor", | |
2653 include_text = "logical", | |
2654 include_main = "logical", | |
2655 mai = "numeric")) | |
2656 Plots_for_Paper$methods( | |
2657 initialize = function(){ | |
2658 data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new()) | |
2659 data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new()) | |
2660 data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr $new()) | |
2661 data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations $new()) | |
2662 }, | |
2663 create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){ | |
2664 print_table_4_data() | |
2665 print_figure_2_data() | |
2666 plot_figure_D(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2667 plot_figure_C(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2668 plot_figure_B(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2669 plot_figure_A(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2670 plot_figure_8(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2671 plot_figure_7(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2672 plot_figure_6(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) | |
2673 plot_figure_5(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2674 plot_figure_4(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2675 plot_figure_3(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) | |
2676 }, | |
2677 print_figure_2_data = function(){ | |
2678 print(create_stats_for_grouping_figure(list(data_processor_a))) | |
2679 }, | |
2680 print_table_4_data = function(){ | |
2681 report_ranges_of_comparisons(processors = list(data_processor_a)) | |
2682 report_ranges_of_comparisons(processors = list(data_processor_c)) | |
2683 }, | |
2684 plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){ | |
2685 plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor = list(data_processor_a), | |
2686 p_show_norm = FALSE, | |
2687 p_include_text = TRUE, | |
2688 p_include_main = p_include_main, | |
2689 p_display_n_psms = FALSE) | |
2690 plot_object$plot_image_in_small_window(p_scale=p_scale) | |
2691 }, | |
2692 plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){ | |
2693 plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a), | |
2694 p_include_text=TRUE, | |
2695 p_include_main=p_include_main, | |
2696 p_ylim = c(-4,4)) | |
2697 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2698 | |
2699 }, | |
2700 plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){ | |
2701 plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a), | |
2702 p_include_text=TRUE, | |
2703 p_include_main=p_include_main) | |
2704 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2705 }, | |
2706 plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){ | |
2707 plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a), | |
2708 p_include_text=TRUE, | |
2709 p_include_main=p_include_main) | |
2710 plot_object$plot_image_in_small_window(p_scale=p_scale) | |
2711 }, | |
2712 plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){ | |
2713 plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a), | |
2714 p_include_text=TRUE, | |
2715 p_include_main=p_include_main) | |
2716 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2717 }, | |
2718 plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){ | |
2719 plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c), | |
2720 p_include_text=TRUE, | |
2721 p_include_main=p_include_main) | |
2722 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2723 }, | |
2724 plot_figure_A = function(p_scale=NULL, p_include_main=NULL){ | |
2725 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a), | |
2726 p_include_text=TRUE, | |
2727 p_include_main=p_include_main, | |
2728 p_ylim_time_invariance=c(-4,4) ) | |
2729 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2730 }, | |
2731 plot_figure_B = function(p_scale=NULL, p_include_main=NULL){ | |
2732 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b), | |
2733 p_include_text=TRUE, | |
2734 p_include_main=p_include_main, | |
2735 p_ylim_time_invariance=c(-4,4) ) | |
2736 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2737 }, | |
2738 plot_figure_C = function(p_scale=NULL, p_include_main=NULL){ | |
2739 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c), | |
2740 p_include_text=TRUE, | |
2741 p_include_main=p_include_main, | |
2742 p_ylim_time_invariance=c(-4,4) ) | |
2743 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2744 }, | |
2745 plot_figure_D = function(p_scale=NULL, p_include_main=NULL){ | |
2746 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d), | |
2747 p_include_text=TRUE, | |
2748 p_include_main=p_include_main, | |
2749 p_ylim_time_invariance=c(-4,4) ) | |
2750 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2751 }, | |
2752 create_stats_for_grouping_figure = function(processors=NULL){ | |
2753 processor <- processors[[1]] | |
2754 processor$i_fdr$ensure() | |
2755 aug_i_fdr <- processor$i_fdr$df | |
2756 aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class) | |
2757 aug_i_fdr$group_null <- "all" | |
2758 table(aug_i_fdr$group_training_class) | |
2759 table(aug_i_fdr$group_good_bad_other) | |
2760 table(aug_i_fdr$group_null) | |
2761 | |
2762 create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){ | |
2763 formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name)) | |
2764 formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name)) | |
2765 agg_fdr <- aggregate(formula=formula_fdr, data=i_fdr, FUN=mean) | |
2766 agg_n <- aggregate(formula=formula_fdr, data=i_fdr, FUN=length) | |
2767 agg_len <- aggregate(formula=formula_len, data=i_fdr, FUN=mean) | |
2768 agg_fdr <- rename_columns(df = agg_fdr, | |
2769 names_before = c(grouping_var_name, "i_fdr"), | |
2770 names_after = c("group" , "fdr")) | |
2771 agg_n <- rename_columns(df = agg_n, | |
2772 names_before = c(grouping_var_name, "i_fdr"), | |
2773 names_after = c("group" , "n")) | |
2774 agg_len <- rename_columns(df = agg_len, | |
2775 names_before = c(grouping_var_name), | |
2776 names_after = c("group" )) | |
2777 agg <- merge(agg_fdr, agg_n) | |
2778 agg <- merge(agg , agg_len) | |
2779 | |
2780 return(agg) | |
2781 } | |
2782 | |
2783 agg_detail <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class") | |
2784 agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other") | |
2785 agg_all <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null") | |
2786 | |
2787 agg <- rbind(agg_detail, agg_grouped) | |
2788 agg <- rbind(agg, agg_all) | |
2789 | |
2790 agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1) | |
2791 | |
2792 linear_combo <- function(x=NULL, a0=NULL, a1=NULL){ | |
2793 result <- (a0 * (1-x) + a1 * x) | |
2794 return(result) | |
2795 } | |
2796 | |
2797 agg$r <- linear_combo(agg$fdr, a0=197, a1= 47) | |
2798 agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85) | |
2799 agg$b <- linear_combo(agg$fdr, a0= 17, a1=151) | |
2800 | |
2801 return(agg) | |
2802 }, | |
2803 report_ranges_of_comparisons = function(processors=NULL){ | |
2804 report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){ | |
2805 report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){ | |
2806 print(group_name) | |
2807 print(sprintf(" Number of PSMs: %d", nrow(data_subset))) | |
2808 mean_confidence <- mean(data_subset$PMD_FDR_input_score) | |
2809 print(sprintf(" Mean Confidence Score: %3.1f", mean_confidence)) | |
2810 print(sprintf(" PeptideShaker g-FDR: %3.1f", 100-mean_confidence)) | |
2811 mean_PMD_FDR = mean(data_subset$i_fdr) | |
2812 print(sprintf(" PMD g-FDR: %3.1f", 100*mean_PMD_FDR)) | |
2813 #col <- col2hex("black", 0.2) | |
2814 #plot(data_subset$i_fdr, pch=".", cex=2, col=col) | |
2815 #abline(h=0) | |
2816 } | |
2817 | |
2818 if (is.null(max_conf)) { | |
2819 data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf) | |
2820 group_name <- sprintf("Group %d", min_conf) | |
2821 } else if (include_max){ | |
2822 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf)) | |
2823 group_name <- sprintf("Group %d through %d", min_conf, max_conf) | |
2824 } else { | |
2825 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf)) | |
2826 group_name <- sprintf("Group %d to %d", min_conf, max_conf) | |
2827 } | |
2828 | |
2829 report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name) | |
2830 } | |
2831 | |
2832 processor <- processors[[1]] | |
2833 processor$i_fdr$ensure() | |
2834 i_fdr <- processor$i_fdr$df | |
2835 info <- processor$info | |
2836 print(sprintf("PMD and Confidence comparison for -- %s", info$collection_name())) | |
2837 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE) | |
2838 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE) | |
2839 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE) | |
2840 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 0, max_conf=100 , include_max=TRUE) | |
2841 } | |
2842 ) | |
2843 | |
2844 ############################################################################### | |
2845 # C - 021 - PMD-FDR Wrapper - functions.R # | |
2846 # # | |
2847 # Creates the necessary structure to convert the PMD-FDR code into one that # | |
2848 # can run as a batch file # | |
2849 # # | |
2850 ############################################################################### | |
2851 ############################################################################### | |
2852 # Class: ModuleArgParser_PMD_FDR | |
2853 ############################################################################### | |
2854 ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR", | |
2855 contains = c("ArgParser"), | |
2856 fields =list(args = "character") ) | |
2857 ModuleArgParser_PMD_FDR$methods( | |
2858 initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){ | |
2859 callSuper(description=description, ...) | |
2860 local_add_argument("--psm_report" , help="full name and path to the PSM report") | |
2861 local_add_argument("--psm_report_1_percent", default = "" , help="full name and path to the PSM report for 1% FDR") | |
2862 local_add_argument("--output_i_fdr" , default = "" , help="full name and path to the i-FDR output file ") | |
2863 local_add_argument("--output_g_fdr" , default = "" , help="full name and path to the g-FDR output file ") | |
2864 local_add_argument("--output_densities" , default = "" , help="full name and path to the densities output file ") | |
2865 local_add_argument("--score_field_name" , default = "" , help="name of score field (in R format)") | |
2866 local_add_argument("--input_file_type" , default = "PMD_FDR_input_file", help="type of input file (currently supports: PSM_Report)") | |
2867 } | |
2868 ) | |
2869 ############################################################################### | |
2870 # Class: Data_Object_Parser | |
2871 ############################################################################### | |
2872 Data_Object_Parser <- setRefClass("Data_Object_Parser", | |
2873 contains = c("Data_Object"), | |
2874 fields =list(parser = "ModuleArgParser_PMD_FDR", | |
2875 args = "character", | |
2876 parsing_results = "list") ) | |
2877 Data_Object_Parser$methods( | |
2878 initialize = function(){ | |
2879 callSuper() | |
2880 class_name <<- "Data_Object_Parser" | |
2881 }, | |
2882 verify = function(){ | |
2883 # Nothing to do here - parser handles verification during load | |
2884 }, | |
2885 m_load_data = function(){ | |
2886 if (length(args) == 0){ | |
2887 parsing_results <<- parser$parse_arguments(NULL) | |
2888 } else { | |
2889 parsing_results <<- parser$parse_arguments(args) | |
2890 } | |
2891 | |
2892 }, | |
2893 set_args = function(p_args=NULL){ | |
2894 # This is primarily used for testing. In operation arguments will be passed automatically (through use of commandArgs) | |
2895 args <<- p_args | |
2896 set_dirty(TRUE) | |
2897 } | |
2898 ) | |
2899 ############################################################################### | |
2900 # Class: Data_Object_Info_Parser | |
2901 ############################################################################### | |
2902 Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser", | |
2903 contains = c("Data_Object_Info"), | |
2904 fields =list( | |
2905 output_i_fdr = "character", | |
2906 output_g_fdr = "character", | |
2907 output_densities = "character" | |
2908 ) ) | |
2909 Data_Object_Info_Parser$methods( | |
2910 initialize = function(){ | |
2911 callSuper() | |
2912 class_name <<- "Data_Object_Info_Parser" | |
2913 }, | |
2914 verify = function(){ | |
2915 check_field_exists = function(field_name=NULL, check_empty = TRUE){ | |
2916 field_value <- get_parser()$parsing_results[field_name] | |
2917 checkTrue(! is.null(field_value), | |
2918 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) | |
2919 if (check_empty){ | |
2920 checkTrue(! is.null(field_value), | |
2921 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) | |
2922 } | |
2923 } | |
2924 # Check parameters passed in | |
2925 check_field_exists("junk") | |
2926 check_field_exists("psm_report") | |
2927 check_field_exists("psm_report_1_percent", check_empty = FALSE) | |
2928 check_field_exists("output_i_fdr" , check_empty = FALSE) | |
2929 check_field_exists("output_g_fdr" , check_empty = FALSE) | |
2930 check_field_exists("output_densities" , check_empty = FALSE) | |
2931 check_field_exists("score_field_name") | |
2932 check_field_exists("input_file_type") | |
2933 }, | |
2934 m_load_data = function(){ | |
2935 parsing_results <- get_parser()$parsing_results | |
2936 | |
2937 data_file_name <<- as.character(parsing_results["psm_report"]) | |
2938 data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"]) | |
2939 data_path_name <<- as.character(parsing_results[""]) | |
2940 #experiment_name <<- data_file_name | |
2941 #designation <<- "" | |
2942 output_i_fdr <<- as.character(parsing_results["output_i_fdr"]) | |
2943 output_g_fdr <<- as.character(parsing_results["output_g_fdr"]) | |
2944 output_densities <<- as.character(parsing_results["output_densities"]) | |
2945 | |
2946 score_field_name <<- as.character(parsing_results["score_field_name"]) | |
2947 set_input_file_type( as.character(parsing_results["input_file_type"])) | |
2948 }, | |
2949 set_parser = function(parser){ | |
2950 parents[["parser"]] <<- parser | |
2951 }, | |
2952 get_parser = function(){ | |
2953 return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents")) | |
2954 }, | |
2955 file_path = function(){ | |
2956 result <- data_file_name # Now assumes that full path is provided | |
2957 if (length(result) == 0){ | |
2958 stop("Unable to validate file path - file name is missing") | |
2959 } | |
2960 return(result) | |
2961 }, | |
2962 file_path_1_percent_FDR = function(){ | |
2963 local_file_name <- get_data_file_name_1_percent_FDR() | |
2964 if (length(local_file_name) == 0){ | |
2965 result <- "" | |
2966 } else { | |
2967 result <- local_file_name # path name is no longer relevant | |
2968 } | |
2969 | |
2970 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream | |
2971 | |
2972 # if (length(result) == 0){ | |
2973 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") | |
2974 # } | |
2975 return(result) | |
2976 }, | |
2977 get_data_file_name_1_percent_FDR = function(){ | |
2978 return(data_file_name_1_percent_FDR) | |
2979 }, | |
2980 collection_name = function(){ | |
2981 result <- "" | |
2982 return(result) | |
2983 }, | |
2984 get_field_name_of_score = function(){ | |
2985 if (length(score_field_name) == 0){ | |
2986 stop("score_field_name has not been set for this project") | |
2987 } | |
2988 return(score_field_name) | |
2989 }, | |
2990 set_input_file_type = function(p_input_file_type=NULL){ | |
2991 if (p_input_file_type == "PSM_Report"){ | |
2992 # do nothing, for now | |
2993 } | |
2994 else if (p_input_file_type == "PMD_FDR_input_file"){ | |
2995 score_field_name <<- "PMD_FDR_input_score" | |
2996 } | |
2997 else { | |
2998 stop(sprintf("input_file_type ('%s') is not currently supported - file_type not changed", p_input_file_type)) | |
2999 } | |
3000 m_input_file_type <<- p_input_file_type | |
3001 }, | |
3002 get_input_file_type = function(){ | |
3003 if (length(m_input_file_type) == 0){ | |
3004 stop("input_file_type has not been set yet (null string)") | |
3005 } | |
3006 if (m_input_file_type == ""){ | |
3007 stop("input_file_type has not been set yet") | |
3008 } | |
3009 | |
3010 return(m_input_file_type) | |
3011 } | |
3012 | |
3013 ) | |
3014 ############################################################################### | |
3015 # Class: Processor_PMD_FDR_for_Galaxy | |
3016 # Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface | |
3017 ############################################################################### | |
3018 Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy", | |
3019 fields = list( | |
3020 parser = "Data_Object_Parser", | |
3021 info = "Data_Object_Info_Parser", | |
3022 raw_data = "Data_Object_Raw_Data", | |
3023 raw_1_percent = "Data_Object_Raw_1_Percent", | |
3024 data_converter = "Data_Object_Data_Converter", | |
3025 data_groups = "Data_Object_Groupings", | |
3026 densities = "Data_Object_Densities", | |
3027 alpha = "Data_Object_Alpha", | |
3028 i_fdr = "Data_Object_Individual_FDR" | |
3029 )) | |
3030 Processor_PMD_FDR_for_Galaxy$methods( | |
3031 initialize = function(){ | |
3032 # This initialization defines all of the dependencies between the various components | |
3033 # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info") | |
3034 | |
3035 # info | |
3036 info$set_parser(parser) | |
3037 parser$append_child(info) | |
3038 | |
3039 # raw_data | |
3040 raw_data$set_info(info) | |
3041 info$append_child(raw_data) | |
3042 | |
3043 # raw_1_percent | |
3044 raw_1_percent$set_info(info) | |
3045 info$append_child(raw_1_percent) | |
3046 | |
3047 # data_converter | |
3048 data_converter$set_info (info) | |
3049 data_converter$set_raw_data(raw_data) | |
3050 info $append_child (data_converter) | |
3051 raw_data $append_child (data_converter) | |
3052 | |
3053 # data_groups | |
3054 data_groups$set_info (info) | |
3055 data_groups$set_data_converter(data_converter) | |
3056 data_groups$set_raw_1_percent (raw_1_percent) | |
3057 info $append_child (data_groups) | |
3058 data_converter$append_child (data_groups) | |
3059 raw_1_percent $append_child (data_groups) | |
3060 | |
3061 # densities | |
3062 densities $set_data_groups(data_groups) | |
3063 data_groups$append_child (densities) | |
3064 | |
3065 # alpha | |
3066 alpha $set_densities(densities) | |
3067 densities$append_child (alpha) | |
3068 | |
3069 # i_fdr | |
3070 i_fdr$set_data_groups(data_groups) | |
3071 i_fdr$set_densities (densities) | |
3072 i_fdr$set_alpha (alpha) | |
3073 data_groups $append_child(i_fdr) | |
3074 densities $append_child(i_fdr) | |
3075 alpha $append_child(i_fdr) | |
3076 | |
3077 }, | |
3078 compute = function(){ | |
3079 #i_fdr is currently the lowest level object - it ultimately depends on everything else. | |
3080 i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure()) | |
3081 | |
3082 save_standard_df(x = densities$df, file_path = info$output_densities) | |
3083 save_standard_df(x = alpha$df, file_path = info$output_g_fdr) | |
3084 save_standard_df(x = i_fdr$df, file_path = info$output_i_fdr) | |
3085 } | |
3086 ) | |
3087 ############################################################################### | |
3088 # D - 021 - PMD-FDR Main.R # | |
3089 # # | |
3090 # File Description: Contains the base code that interprets the parameters # | |
3091 # and computes i-FDR and g-FDR for a mass spec project # | |
3092 # # | |
3093 ############################################################################### | |
3094 argv <- commandArgs(TRUE) # Saves the parameters (command code) | |
3095 | |
3096 processor <- Processor_PMD_FDR_for_Galaxy$new() | |
3097 processor$parser$set_args(argv) | |
3098 processor$compute() | |
3099 |