Mercurial > repos > greg > ideas
comparison ideas.R @ 174:b0ca3591242e draft
Uploaded
author | greg |
---|---|
date | Thu, 25 Jan 2018 11:53:15 -0500 |
parents | 59ed3d424524 |
children | 2b6b5e2769f5 |
comparison
equal
deleted
inserted
replaced
173:843bfa2dff2c | 174:b0ca3591242e |
---|---|
2 | 2 |
3 suppressPackageStartupMessages(library("data.table")) | 3 suppressPackageStartupMessages(library("data.table")) |
4 suppressPackageStartupMessages(library("optparse")) | 4 suppressPackageStartupMessages(library("optparse")) |
5 | 5 |
6 option_list <- list( | 6 option_list <- list( |
7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), | 7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), |
8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), | 8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), |
9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), | 9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), |
10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), | 10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), |
11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), | 11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), |
12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), | 12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), |
13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"), | 13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"), |
14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"), | 14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"), |
15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), | 15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), |
16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), | 16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), |
17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), | 17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), |
18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), | 18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), |
19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), | 19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), |
20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), | 20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), |
21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), | 21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), |
22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), | 22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), |
23 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), | 23 make_option(c("--output_dir"), action="store", dest="output_dir", help="Output directory, used only if job ends in error and process log needs saving"), |
24 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), | 24 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), |
25 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), | 25 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), |
26 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), | 26 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), |
27 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), | 27 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), |
28 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), | 28 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), |
29 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), | 29 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), |
30 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), | 30 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), |
31 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") | 31 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), |
32 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") | |
32 ) | 33 ) |
33 | 34 |
34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | 35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) |
35 args <- parse_args(parser, positional_arguments=TRUE) | 36 args <- parse_args(parser, positional_arguments=TRUE) |
36 opt <- args$options | 37 opt <- args$options |
37 | 38 |
38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { | 39 add_output_redirect <- function(cmd, output_log) { |
39 if (is.null(save_ideas_log)) { | 40 new_cmd = c(cmd, "&>>", output_log); |
40 new_cmd = c(cmd, "&>>", default_log_name); | |
41 }else { | |
42 new_cmd = c(cmd, "&>>", output_log); | |
43 } | |
44 return(paste(new_cmd, collapse=" ")); | 41 return(paste(new_cmd, collapse=" ")); |
45 } | 42 } |
46 | 43 |
47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { | 44 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { |
48 X = NULL; | 45 X = NULL; |
349 for (f in files) { | 346 for (f in files) { |
350 unlink(f); | 347 unlink(f); |
351 } | 348 } |
352 } | 349 } |
353 | 350 |
354 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { | 351 run_cmd <- function(cmd, save_ideas_log, output_log, output_dir) { |
352 cat("save_ideas_log: ", save_ideas_log, "\n"); | |
353 cat("output_log: ", output_log, "\n"); | |
354 cat("output_dir: ", output_dir, "\n"); | |
355 cat("\nRunning cmd:\n", cmd, "\n\n"); | |
355 rc = system(cmd); | 356 rc = system(cmd); |
356 if (rc != 0) { | 357 if (rc != 0) { |
357 if (is.null(save_ideas_log)) { | 358 if (is.null(save_ideas_log)) { |
358 file.rename(default_log_name, output_log); | 359 to_path = paste(output_dir, output_log, sep="/"); |
360 file.rename(output_log, to_path); | |
359 } | 361 } |
360 quit(save="no", status=rc); | 362 quit(save="no", status=rc); |
361 } | 363 } |
362 } | 364 } |
363 | 365 |
364 # Initialize values. | 366 # Initialize values. |
365 default_log_name = "ideas_log.txt"; | 367 if (is.null(opt$save_ideas_log)) { |
368 output_log = "ideas_log.txt"; | |
369 } else { | |
370 output_log = opt$output_log; | |
371 } | |
366 # Get full path of chromosomes.bed if not NULL. | 372 # Get full path of chromosomes.bed if not NULL. |
367 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input); | 373 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input); |
374 cat("chrom_bed_input: ", chrom_bed_input, "\n"); | |
368 # Get full path of chromosome_windows.txt if not NULL. | 375 # Get full path of chromosome_windows.txt if not NULL. |
369 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows); | 376 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows); |
377 cat("chromosome_windows: ", chromosome_windows, "\n"); | |
370 if (is.null(chromosome_windows)) { | 378 if (is.null(chromosome_windows)) { |
371 windows_by_chrom = NULL; | 379 windows_by_chrom = NULL; |
372 } else { | 380 } else { |
373 # Read chromosome_windows.txt into memory. | 381 # Read chromosome_windows.txt into memory. |
374 windows_by_chrom = get_windows_by_chrom(chromosome_windows); | 382 windows_by_chrom = get_windows_by_chrom(chromosome_windows); |
375 } | 383 } |
376 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config); | 384 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config); |
385 cat("ideas_input_config: ", ideas_input_config, "\n"); | |
377 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, | 386 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, |
378 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, | 387 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, |
379 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, | 388 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, |
380 opt$maxerr, opt$rseed, opt$thread); | 389 opt$maxerr, opt$rseed, opt$thread); |
390 cat("base_cmd: ", base_cmd, "\n"); | |
381 output_base_name = opt$project_name; | 391 output_base_name = opt$project_name; |
392 cat("output_base_name: ", output_base_name, "\n"); | |
382 | 393 |
383 if (is.null(opt$training_iterations)) { | 394 if (is.null(opt$training_iterations)) { |
384 # Not performing training. | 395 # Not performing training. |
385 if (is.null(windows_by_chrom)) { | 396 if (is.null(windows_by_chrom)) { |
386 # Not performing windows by chromosome. | 397 # Not performing windows by chromosome. |
387 output_name = output_base_name; | 398 output_name = output_base_name; |
388 cmd = paste(base_cmd, "-o", output_name, sep=" "); | 399 cmd = paste(base_cmd, "-o", output_name, sep=" "); |
389 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 400 cmd = add_output_redirect(cmd, output_log); |
390 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 401 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
391 } else { | 402 } else { |
392 # Performing windows by chromosome. | 403 # Performing windows by chromosome. |
393 for (i in 1:length(windows_by_chrom)) { | 404 for (i in 1:length(windows_by_chrom)) { |
394 line = windows_by_chrom[i]; | 405 line = windows_by_chrom[i]; |
395 items = strsplit(line, " ")[[1]]; | 406 items = strsplit(line, " ")[[1]]; |
397 window_start = items[2]; | 408 window_start = items[2]; |
398 window_end = items[3]; | 409 window_end = items[3]; |
399 output_name = paste(output_base_name, chrom, sep="."); | 410 output_name = paste(output_base_name, chrom, sep="."); |
400 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | 411 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); |
401 cmd = paste(cmd, "-o", output_name, sep=" "); | 412 cmd = paste(cmd, "-o", output_name, sep=" "); |
402 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 413 cmd = add_output_redirect(cmd, output_log); |
403 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 414 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
404 } | 415 } |
405 } | 416 } |
406 } else { | 417 } else { |
407 # performing training. | 418 # performing training. |
408 output_para0 = paste(output_base_name, "para0", sep="."); | 419 output_para0 = paste(output_base_name, "para0", sep="."); |
409 output_profile0 = paste(output_base_name, "profile0", sep="."); | 420 output_profile0 = paste(output_base_name, "profile0", sep="."); |
410 for (i in 1:opt$training_iterations) { | 421 for (i in 1:opt$training_iterations) { |
411 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); | 422 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); |
412 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 423 cmd = add_output_redirect(cmd, output_log); |
413 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 424 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
414 } | 425 } |
415 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); | 426 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); |
416 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | 427 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); |
417 para = tpara$para; | 428 para = tpara$para; |
418 para = apply(para, 1, function(x){paste(x, collapse=" ")}); | 429 para = apply(para, 1, function(x){paste(x, collapse=" ")}); |
421 # Now run IDEAS based on the files produced during training. | 432 # Now run IDEAS based on the files produced during training. |
422 base_cmd = get_post_training_base_cmd(base_cmd, para); | 433 base_cmd = get_post_training_base_cmd(base_cmd, para); |
423 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); | 434 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); |
424 if (is.null(windows_by_chrom)) { | 435 if (is.null(windows_by_chrom)) { |
425 cmd = c(base_cmd, "-o", output_base_name); | 436 cmd = c(base_cmd, "-o", output_base_name); |
426 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 437 cmd = add_output_redirect(cmd, output_log); |
427 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 438 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
428 } else { | 439 } else { |
429 # Performing windows by chromosome. | 440 # Performing windows by chromosome. |
430 if (length(windows_by_chrom) == 1) { | 441 if (length(windows_by_chrom) == 1) { |
431 output_name = paste(output_base_name, i, sep="."); | 442 output_name = paste(output_base_name, i, sep="."); |
432 cmd = c(base_cmd, "-o", output_name); | 443 cmd = c(base_cmd, "-o", output_name); |
433 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 444 cmd = add_output_redirect(cmd, output_log); |
434 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 445 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
435 } else { | 446 } else { |
436 for (i in 1:length(windows_by_chrom)) { | 447 for (i in 1:length(windows_by_chrom)) { |
437 line = windows_by_chrom[i]; | 448 line = windows_by_chrom[i]; |
438 items = strsplit(line, " ")[[1]]; | 449 items = strsplit(line, " ")[[1]]; |
439 chrom = items[[1]]; | 450 chrom = items[[1]]; |
440 window_start = items[[2]]; | 451 window_start = items[[2]]; |
441 window_end = items[[3]]; | 452 window_end = items[[3]]; |
442 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | 453 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); |
443 output_name = paste(output_base_name, chrom, sep="."); | 454 output_name = paste(output_base_name, chrom, sep="."); |
444 cmd = paste(cmd, "-o", output_name, sep=" "); | 455 cmd = paste(cmd, "-o", output_name, sep=" "); |
445 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 456 cmd = add_output_redirect(cmd, output_log); |
446 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 457 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); |
447 } | 458 } |
448 } | 459 } |
449 } | 460 } |
450 # Remove temporary outputs. | 461 # Remove temporary outputs. |
451 remove_files(path=".", pattern="tmp"); | 462 remove_files(path=".", pattern="tmp"); |