Mercurial > repos > greg > ideas
changeset 166:7d33a8157c6b draft
Uploaded
author | greg |
---|---|
date | Thu, 18 Jan 2018 13:47:50 -0500 |
parents | bb5544d1c85e |
children | c5b77e9b36f1 |
files | ideas.R |
diffstat | 1 files changed, 153 insertions(+), 72 deletions(-) [+] |
line wrap: on
line diff
--- a/ideas.R Thu Jan 18 13:47:35 2018 -0500 +++ b/ideas.R Thu Jan 18 13:47:50 2018 -0500 @@ -28,8 +28,7 @@ make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), - make_option(c("--window_end"), action="store", dest="window_end", type="integer", default=NULL, help="Windows positions by chromosome end value"), - make_option(c("--window_start"), action="store", dest="window_start", type="integer", default=NULL, help="Windows positions by chromosome start value") + make_option(c("--windows_config"), action="store", dest="windows_config", default=NULL, help="Windows positions by chroms config") ) parser <- OptionParser(usage="%prog [options] file", option_list=option_list) @@ -38,11 +37,11 @@ add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { if (is.null(save_ideas_log)) { - cmd = paste(cmd, "&>>", default_log_name, sep=" "); + new_cmd = c(cmd, "&>>", default_log_name); }else { - cmd = paste(cmd, "&>>", output_log, sep=" "); + new_cmd = c(cmd, "&>>", output_log); } - return(cmd); + return(paste(new_cmd, collapse=" ")); } combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { @@ -241,6 +240,55 @@ return(dd); } +get_base_cmd <- function(prep_output_config, windows_bed, training_iterations, bychr, hp, norm, log2, + max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration, + burnin_num, mcmc_num, minerr, maxerr, rseed, thread) { + base_cmd = paste("ideas", prep_output_config, sep=" "); + if (!is.null(windows_bed)) { + base_cmd = paste(base_cmd, windows_bed, sep=" "); + } + if (!is.null(training_iterations)) { + base_cmd = paste(base_cmd, "-impute none", sep=" "); + } + if (bychr) { + base_cmd = paste(base_cmd, "-bychr", sep=" "); + } + if (hp) { + base_cmd = paste(base_cmd, "-hp", sep=" "); + } + if (norm) { + base_cmd = paste(base_cmd, "-norm", sep=" "); + } + if (!is.null(log2)) { + base_cmd = paste(base_cmd, "-log2", log2, sep=" "); + } + if (!is.null(max_states)) { + base_cmd = paste(base_cmd, "-G", max_states, sep=" "); + } + if (!is.null(initial_states)) { + base_cmd = paste(base_cmd, "-C", initial_states, sep=" "); + } + if (!is.null(max_position_classes)) { + base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" "); + } + if (!is.null(max_cell_type_clusters)) { + base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" "); + } + if (!is.null(prior_concentration)) { + base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" "); + } + base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" "); + if (!is.null(minerr)) { + base_cmd = paste(base_cmd, "-minerr", minerr, sep=" "); + } + if (!is.null(maxerr)) { + base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" "); + } + base_cmd = paste(base_cmd, "-rseed", rseed, sep=" "); + base_cmd = paste(base_cmd, "-thread", thread, sep=" "); + return(base_cmd); +} + get_mean <- function(n) { N = NULL; for(i in sort(unique(n[,1]))) { @@ -251,6 +299,34 @@ return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); } +get_post_training_base_cmd <- function(base_cmd, para) { + # Change base_cmd due to training mode. + base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]]; + if (length(which(base_cmd_items == "-G")) == 0) { + base_cmd_items = c(base_cmd_items, "-G", length(para)-1); + } else { + tt = which(base_cmd_items == "-G"); + base_cmd_items[tt + 1] = length(para)-1; + } + tt = which(base_cmd_items == '-C'); + if(length(tt) > 0) { + base_cmd_items = base_cmd_items[-c(tt, tt+1)]; + } + base_cmd = paste(base_cmd_items, collapse=" "); + return(base_cmd); +} + +get_windows_by_chrom <- function(windows_config) { + if (is.null(windows_config)) { + windows_by_chrom = NULL; + } else { + fh = file(windows_config, "r"); + windows_by_chrom = readLines(fh); + close(fh); + } + return(windows_by_chrom); +} + make_parameter <- function(myorder, id, mem, mycut, para) { rt = NULL; j = 0; @@ -264,8 +340,14 @@ return(rt); } +remove_files <- function(path, pattern) { + files = list.files(path=path, pattern=pattern); + for (f in files) { + unlink(f); + } +} + run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { - cat("\n\n >>>>> cmd:\n", cmd, "\n\n"); rc = system(cmd); if (rc != 0) { if (is.null(save_ideas_log)) { @@ -276,81 +358,80 @@ } default_log_name = "ideas_log.txt"; +windows_by_chrom = get_windows_by_chrom(opt$windows_config); +base_cmd = get_base_cmd(opt$prep_output_config, opt$windows_bed, opt$training_iterations, opt$bychr, + opt$hp, opt$norm, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, + opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, + opt$maxerr, opt$rseed, opt$thread); output_base_name = opt$project_name; -cmd = paste("ideas", opt$prep_output_config, sep=" "); -if (!is.null(opt$windows_bed)) { - cmd = paste(cmd, opt$windows_bed, sep=" "); -} -if (!is.null(opt$training_iterations)) { - cmd = paste(cmd, "-impute none", sep=" "); -} -if (opt$bychr) { - cmd = paste(cmd, "-bychr", sep=" "); -} -if (opt$hp) { - cmd = paste(cmd, "-hp", sep=" "); -} -if (opt$norm) { - cmd = paste(cmd, "-norm", sep=" "); -} -if (!is.null(opt$window_start) && !is.null(opt$window_end)) { - cmd = paste(cmd, "-inv", opt$window_start, opt$window_end, sep=" "); -} -if (!is.null(opt$log2)) { - cmd = paste(cmd, "-log2", opt$log2, sep=" "); -} -if (!is.null(opt$max_states)) { - cmd = paste(cmd, "-G", opt$max_states, sep=" "); -} -if (!is.null(opt$initial_states)) { - cmd = paste(cmd, "-C", opt$initial_states, sep=" "); -} -if (!is.null(opt$max_position_classes)) { - cmd = paste(cmd, "-P", opt$max_position_classes, sep=" "); -} -if (!is.null(opt$max_cell_type_clusters)) { - cmd = paste(cmd, "-K", opt$max_cell_type_clusters, sep=" "); -} -if (!is.null(opt$prior_concentration)) { - cmd = paste(cmd, "-A", opt$prior_concentration, sep=" "); -} -cmd = paste(cmd, "-sample", opt$burnin_num, opt$mcmc_num, sep=" "); -if (!is.null(opt$minerr)) { - cmd = paste(cmd, "-minerr", opt$minerr, sep=" "); -} -if (!is.null(opt$maxerr)) { - cmd = paste(cmd, "-maxerr", opt$maxerr, sep=" "); -} -cmd = paste(cmd, "-rseed", opt$rseed, sep=" "); -cmd = paste(cmd, "-thread", opt$thread, sep=" "); if (is.null(opt$training_iterations)) { - final_cmd = paste(cmd, "-o", output_base_name, sep=" "); - final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); - run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); + # Not performing training. + if (is.null(windows_by_chrom)) { + # Not performing windows by chromosome. + output_name = output_base_name; + cmd = paste(base_cmd, "-o", output_name, sep=" "); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + } else { + # Performing windows by chromosome. + for (i in 1:length(windows_by_chrom)) { + line = windows_by_chrom[i]; + items = strsplit(line, " ")[[1]]; + chrom = items[1]; + window_start = items[2]; + window_end = items[3]; + output_name = paste(output_base_name,, chrom, sep="."); + cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); + cmd = paste(cmd, "-o", output_name, sep=" "); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + } + } } else { - output_para0 = paste(output_base_name, ".para0", sep=""); - output_profile0 = paste(output_base_name, ".profile0", sep=""); + # performing training. + output_para0 = paste(output_base_name, "para0", sep="."); + output_profile0 = paste(output_base_name, "profile0", sep="."); for (i in 1:opt$training_iterations) { - final_cmd = paste(cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); - final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); - run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); + cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); } - tpara = combine_state(paste(output_base_name, ".tmp.", (1:opt$training_iterations), ".para", sep=""), mycut=0.5); + tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); + write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); para = tpara$para; - write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); para = apply(para, 1, function(x){paste(x, collapse=" ")}); - para = c(readLines(paste(output_base_name, ".tmp.1.para", sep=""), n=1), para); + para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para); writeLines(para, output_para0); - cmd = c(cmd, "-otherpara", output_para0, output_profile0); - if (length(which(cmd == "-G")) == 0) { - cmd = c(cmd, "-G", length(para)-1); + # Now run IDEAS based on the files produced during training. + base_cmd = get_post_training_base_cmd(base_cmd, para); + base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); + if (is.null(windows_by_chrom)) { + cmd = c(base_cmd, "-o", output_base_name); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); } else { - tt = which(cmd == "-G"); - cmd[tt + 1] = length(para)-1; + # Performing windows by chromosome. + if (length(windows_by_chrom) == 1) { + output_name = paste(output_base_name, i, sep="."); + cmd = c(base_cmd, "-o", output_name); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + } else { + for (i in 1:length(windows_by_chrom)) { + line = windows_by_chrom[i]; + items = strsplit(line, " ")[[1]]; + chrom = items[[1]]; + window_start = items[[2]]; + window_end = items[[3]]; + cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); + output_name = paste(output_base_name, chrom, sep="."); + cmd = paste(cmd, "-o", output_name, sep=" "); + cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); + } + } } - tt = which(cmd == '-C'); - if(length(tt)>0) { - cmd = cmd[-c(tt, tt+1)]; - } + # Remove temporary outputs. + remove_files(path=".", pattern="tmp"); }