comparison ideas.R @ 174:b0ca3591242e draft

Uploaded
author greg
date Thu, 25 Jan 2018 11:53:15 -0500
parents 59ed3d424524
children 2b6b5e2769f5
comparison
equal deleted inserted replaced
173:843bfa2dff2c 174:b0ca3591242e
2 2
3 suppressPackageStartupMessages(library("data.table")) 3 suppressPackageStartupMessages(library("data.table"))
4 suppressPackageStartupMessages(library("optparse")) 4 suppressPackageStartupMessages(library("optparse"))
5 5
6 option_list <- list( 6 option_list <- list(
7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), 7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"),
8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), 8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"),
9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), 9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"),
10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), 10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"),
11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), 11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"),
12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), 12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"),
13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"), 13 make_option(c("--input"), action="store", dest="input", help="IdeasPre input dataset"),
14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"), 14 make_option(c("--input_files_path"), action="store", dest="input_files_path", help="IdeasPre input dataset extra files path"),
15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), 15 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"),
16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), 16 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"),
17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), 17 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"),
18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), 18 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"),
19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), 19 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"),
20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), 20 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"),
21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), 21 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"),
22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), 22 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"),
23 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), 23 make_option(c("--output_dir"), action="store", dest="output_dir", help="Output directory, used only if job ends in error and process log needs saving"),
24 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), 24 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"),
25 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), 25 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"),
26 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), 26 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"),
27 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), 27 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"),
28 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), 28 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"),
29 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), 29 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"),
30 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), 30 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"),
31 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") 31 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"),
32 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations")
32 ) 33 )
33 34
34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) 35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list)
35 args <- parse_args(parser, positional_arguments=TRUE) 36 args <- parse_args(parser, positional_arguments=TRUE)
36 opt <- args$options 37 opt <- args$options
37 38
38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { 39 add_output_redirect <- function(cmd, output_log) {
39 if (is.null(save_ideas_log)) { 40 new_cmd = c(cmd, "&>>", output_log);
40 new_cmd = c(cmd, "&>>", default_log_name);
41 }else {
42 new_cmd = c(cmd, "&>>", output_log);
43 }
44 return(paste(new_cmd, collapse=" ")); 41 return(paste(new_cmd, collapse=" "));
45 } 42 }
46 43
47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { 44 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) {
48 X = NULL; 45 X = NULL;
349 for (f in files) { 346 for (f in files) {
350 unlink(f); 347 unlink(f);
351 } 348 }
352 } 349 }
353 350
354 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { 351 run_cmd <- function(cmd, save_ideas_log, output_log, output_dir) {
352 cat("save_ideas_log: ", save_ideas_log, "\n");
353 cat("output_log: ", output_log, "\n");
354 cat("output_dir: ", output_dir, "\n");
355 cat("\nRunning cmd:\n", cmd, "\n\n");
355 rc = system(cmd); 356 rc = system(cmd);
356 if (rc != 0) { 357 if (rc != 0) {
357 if (is.null(save_ideas_log)) { 358 if (is.null(save_ideas_log)) {
358 file.rename(default_log_name, output_log); 359 to_path = paste(output_dir, output_log, sep="/");
360 file.rename(output_log, to_path);
359 } 361 }
360 quit(save="no", status=rc); 362 quit(save="no", status=rc);
361 } 363 }
362 } 364 }
363 365
364 # Initialize values. 366 # Initialize values.
365 default_log_name = "ideas_log.txt"; 367 if (is.null(opt$save_ideas_log)) {
368 output_log = "ideas_log.txt";
369 } else {
370 output_log = opt$output_log;
371 }
366 # Get full path of chromosomes.bed if not NULL. 372 # Get full path of chromosomes.bed if not NULL.
367 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input); 373 chrom_bed_input = get_file_path(opt$input_files_path, opt$chrom_bed_input);
374 cat("chrom_bed_input: ", chrom_bed_input, "\n");
368 # Get full path of chromosome_windows.txt if not NULL. 375 # Get full path of chromosome_windows.txt if not NULL.
369 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows); 376 chromosome_windows = get_file_path(opt$input_files_path, opt$chromosome_windows);
377 cat("chromosome_windows: ", chromosome_windows, "\n");
370 if (is.null(chromosome_windows)) { 378 if (is.null(chromosome_windows)) {
371 windows_by_chrom = NULL; 379 windows_by_chrom = NULL;
372 } else { 380 } else {
373 # Read chromosome_windows.txt into memory. 381 # Read chromosome_windows.txt into memory.
374 windows_by_chrom = get_windows_by_chrom(chromosome_windows); 382 windows_by_chrom = get_windows_by_chrom(chromosome_windows);
375 } 383 }
376 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config); 384 ideas_input_config = get_file_path(opt$input_files_path, opt$ideas_input_config);
385 cat("ideas_input_config: ", ideas_input_config, "\n");
377 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, 386 base_cmd = get_base_cmd(ideas_input_config, chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp,
378 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, 387 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes,
379 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, 388 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr,
380 opt$maxerr, opt$rseed, opt$thread); 389 opt$maxerr, opt$rseed, opt$thread);
390 cat("base_cmd: ", base_cmd, "\n");
381 output_base_name = opt$project_name; 391 output_base_name = opt$project_name;
392 cat("output_base_name: ", output_base_name, "\n");
382 393
383 if (is.null(opt$training_iterations)) { 394 if (is.null(opt$training_iterations)) {
384 # Not performing training. 395 # Not performing training.
385 if (is.null(windows_by_chrom)) { 396 if (is.null(windows_by_chrom)) {
386 # Not performing windows by chromosome. 397 # Not performing windows by chromosome.
387 output_name = output_base_name; 398 output_name = output_base_name;
388 cmd = paste(base_cmd, "-o", output_name, sep=" "); 399 cmd = paste(base_cmd, "-o", output_name, sep=" ");
389 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 400 cmd = add_output_redirect(cmd, output_log);
390 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 401 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
391 } else { 402 } else {
392 # Performing windows by chromosome. 403 # Performing windows by chromosome.
393 for (i in 1:length(windows_by_chrom)) { 404 for (i in 1:length(windows_by_chrom)) {
394 line = windows_by_chrom[i]; 405 line = windows_by_chrom[i];
395 items = strsplit(line, " ")[[1]]; 406 items = strsplit(line, " ")[[1]];
397 window_start = items[2]; 408 window_start = items[2];
398 window_end = items[3]; 409 window_end = items[3];
399 output_name = paste(output_base_name, chrom, sep="."); 410 output_name = paste(output_base_name, chrom, sep=".");
400 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); 411 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
401 cmd = paste(cmd, "-o", output_name, sep=" "); 412 cmd = paste(cmd, "-o", output_name, sep=" ");
402 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 413 cmd = add_output_redirect(cmd, output_log);
403 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 414 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
404 } 415 }
405 } 416 }
406 } else { 417 } else {
407 # performing training. 418 # performing training.
408 output_para0 = paste(output_base_name, "para0", sep="."); 419 output_para0 = paste(output_base_name, "para0", sep=".");
409 output_profile0 = paste(output_base_name, "profile0", sep="."); 420 output_profile0 = paste(output_base_name, "profile0", sep=".");
410 for (i in 1:opt$training_iterations) { 421 for (i in 1:opt$training_iterations) {
411 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); 422 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" ");
412 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 423 cmd = add_output_redirect(cmd, output_log);
413 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 424 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
414 } 425 }
415 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); 426 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5);
416 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); 427 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F);
417 para = tpara$para; 428 para = tpara$para;
418 para = apply(para, 1, function(x){paste(x, collapse=" ")}); 429 para = apply(para, 1, function(x){paste(x, collapse=" ")});
421 # Now run IDEAS based on the files produced during training. 432 # Now run IDEAS based on the files produced during training.
422 base_cmd = get_post_training_base_cmd(base_cmd, para); 433 base_cmd = get_post_training_base_cmd(base_cmd, para);
423 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); 434 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" ");
424 if (is.null(windows_by_chrom)) { 435 if (is.null(windows_by_chrom)) {
425 cmd = c(base_cmd, "-o", output_base_name); 436 cmd = c(base_cmd, "-o", output_base_name);
426 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 437 cmd = add_output_redirect(cmd, output_log);
427 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 438 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
428 } else { 439 } else {
429 # Performing windows by chromosome. 440 # Performing windows by chromosome.
430 if (length(windows_by_chrom) == 1) { 441 if (length(windows_by_chrom) == 1) {
431 output_name = paste(output_base_name, i, sep="."); 442 output_name = paste(output_base_name, i, sep=".");
432 cmd = c(base_cmd, "-o", output_name); 443 cmd = c(base_cmd, "-o", output_name);
433 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 444 cmd = add_output_redirect(cmd, output_log);
434 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 445 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
435 } else { 446 } else {
436 for (i in 1:length(windows_by_chrom)) { 447 for (i in 1:length(windows_by_chrom)) {
437 line = windows_by_chrom[i]; 448 line = windows_by_chrom[i];
438 items = strsplit(line, " ")[[1]]; 449 items = strsplit(line, " ")[[1]];
439 chrom = items[[1]]; 450 chrom = items[[1]];
440 window_start = items[[2]]; 451 window_start = items[[2]];
441 window_end = items[[3]]; 452 window_end = items[[3]];
442 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); 453 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
443 output_name = paste(output_base_name, chrom, sep="."); 454 output_name = paste(output_base_name, chrom, sep=".");
444 cmd = paste(cmd, "-o", output_name, sep=" "); 455 cmd = paste(cmd, "-o", output_name, sep=" ");
445 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 456 cmd = add_output_redirect(cmd, output_log);
446 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); 457 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir);
447 } 458 }
448 } 459 }
449 } 460 }
450 # Remove temporary outputs. 461 # Remove temporary outputs.
451 remove_files(path=".", pattern="tmp"); 462 remove_files(path=".", pattern="tmp");