changeset 166:7d33a8157c6b draft

Uploaded
author greg
date Thu, 18 Jan 2018 13:47:50 -0500
parents bb5544d1c85e
children c5b77e9b36f1
files ideas.R
diffstat 1 files changed, 153 insertions(+), 72 deletions(-) [+]
line wrap: on
line diff
--- a/ideas.R	Thu Jan 18 13:47:35 2018 -0500
+++ b/ideas.R	Thu Jan 18 13:47:50 2018 -0500
@@ -28,8 +28,7 @@
     make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"),
     make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"),
     make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"),
-    make_option(c("--window_end"), action="store", dest="window_end", type="integer", default=NULL, help="Windows positions by chromosome end value"),
-    make_option(c("--window_start"), action="store", dest="window_start", type="integer", default=NULL, help="Windows positions by chromosome start value")
+    make_option(c("--windows_config"), action="store", dest="windows_config", default=NULL, help="Windows positions by chroms config")
 )
 
 parser <- OptionParser(usage="%prog [options] file", option_list=option_list)
@@ -38,11 +37,11 @@
 
 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) {
     if (is.null(save_ideas_log)) {
-        cmd = paste(cmd, "&>>", default_log_name, sep=" ");
+        new_cmd = c(cmd, "&>>", default_log_name);
     }else {
-        cmd = paste(cmd, "&>>", output_log, sep=" ");
+        new_cmd = c(cmd, "&>>", output_log);
     }
-    return(cmd);
+    return(paste(new_cmd, collapse=" "));
 }
 
 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) {
@@ -241,6 +240,55 @@
     return(dd);
 }
 
+get_base_cmd <- function(prep_output_config, windows_bed, training_iterations, bychr, hp, norm, log2,
+        max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration,
+        burnin_num, mcmc_num, minerr, maxerr, rseed, thread) {
+    base_cmd = paste("ideas", prep_output_config, sep=" ");
+    if (!is.null(windows_bed)) {
+        base_cmd = paste(base_cmd, windows_bed, sep=" ");
+    }
+    if (!is.null(training_iterations)) {
+        base_cmd = paste(base_cmd, "-impute none", sep=" ");
+    }
+    if (bychr) {
+        base_cmd = paste(base_cmd, "-bychr", sep=" ");
+    }
+    if (hp) {
+        base_cmd = paste(base_cmd, "-hp", sep=" ");
+    }
+    if (norm) {
+        base_cmd = paste(base_cmd, "-norm", sep=" ");
+    }
+    if (!is.null(log2)) {
+        base_cmd = paste(base_cmd, "-log2", log2, sep=" ");
+    }
+    if (!is.null(max_states)) {
+        base_cmd = paste(base_cmd, "-G", max_states, sep=" ");
+    }
+    if (!is.null(initial_states)) {
+        base_cmd = paste(base_cmd, "-C", initial_states, sep=" ");
+    }
+    if (!is.null(max_position_classes)) {
+        base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" ");
+    }
+    if (!is.null(max_cell_type_clusters)) {
+        base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" ");
+    }
+    if (!is.null(prior_concentration)) {
+        base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" ");
+    }
+    base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" ");
+    if (!is.null(minerr)) {
+        base_cmd = paste(base_cmd, "-minerr", minerr, sep=" ");
+    }
+    if (!is.null(maxerr)) {
+        base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" ");
+    }
+    base_cmd = paste(base_cmd, "-rseed", rseed, sep=" ");
+    base_cmd = paste(base_cmd, "-thread", thread, sep=" ");
+    return(base_cmd);
+}
+
 get_mean <- function(n) {
     N = NULL;
     for(i in sort(unique(n[,1]))) {
@@ -251,6 +299,34 @@
     return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2)));
 }
 
+get_post_training_base_cmd <- function(base_cmd, para) {
+    # Change base_cmd due to training mode.
+    base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]];
+    if (length(which(base_cmd_items == "-G")) == 0) {
+        base_cmd_items = c(base_cmd_items, "-G", length(para)-1);
+    } else {
+        tt = which(base_cmd_items == "-G");
+        base_cmd_items[tt + 1] = length(para)-1;
+    }
+    tt = which(base_cmd_items == '-C');
+    if(length(tt) > 0) {
+        base_cmd_items = base_cmd_items[-c(tt, tt+1)];
+    }
+    base_cmd = paste(base_cmd_items, collapse=" ");
+    return(base_cmd);
+}
+
+get_windows_by_chrom <- function(windows_config) {
+    if (is.null(windows_config)) {
+        windows_by_chrom = NULL;
+    } else {
+        fh = file(windows_config, "r");
+        windows_by_chrom = readLines(fh);
+        close(fh);
+    }
+    return(windows_by_chrom);
+}
+
 make_parameter <- function(myorder, id, mem, mycut, para) {
     rt = NULL;
     j = 0;
@@ -264,8 +340,14 @@
     return(rt);
 }
 
+remove_files <- function(path, pattern) {
+    files = list.files(path=path, pattern=pattern);
+    for (f in files) {
+        unlink(f);
+    }
+}
+
 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) {
-    cat("\n\n >>>>> cmd:\n", cmd, "\n\n");
     rc = system(cmd);
     if (rc != 0) {
         if (is.null(save_ideas_log)) {
@@ -276,81 +358,80 @@
 }
 
 default_log_name = "ideas_log.txt";
+windows_by_chrom = get_windows_by_chrom(opt$windows_config);
+base_cmd = get_base_cmd(opt$prep_output_config, opt$windows_bed, opt$training_iterations, opt$bychr,
+        opt$hp, opt$norm, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes,
+        opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr,
+        opt$maxerr, opt$rseed, opt$thread);
 output_base_name = opt$project_name;
-cmd = paste("ideas", opt$prep_output_config, sep=" ");
-if (!is.null(opt$windows_bed)) {
-    cmd = paste(cmd, opt$windows_bed, sep=" ");
-}
-if (!is.null(opt$training_iterations)) {
-    cmd = paste(cmd, "-impute none", sep=" ");
-}
-if (opt$bychr) {
-    cmd = paste(cmd, "-bychr", sep=" ");
-}
-if (opt$hp) {
-    cmd = paste(cmd, "-hp", sep=" ");
-}
-if (opt$norm) {
-    cmd = paste(cmd, "-norm", sep=" ");
-}
-if (!is.null(opt$window_start) && !is.null(opt$window_end)) {
-    cmd = paste(cmd, "-inv", opt$window_start, opt$window_end, sep=" ");
-}
-if (!is.null(opt$log2)) {
-    cmd = paste(cmd, "-log2", opt$log2, sep=" ");
-}
-if (!is.null(opt$max_states)) {
-    cmd = paste(cmd, "-G", opt$max_states, sep=" ");
-}
-if (!is.null(opt$initial_states)) {
-    cmd = paste(cmd, "-C", opt$initial_states, sep=" ");
-}
-if (!is.null(opt$max_position_classes)) {
-    cmd = paste(cmd, "-P", opt$max_position_classes, sep=" ");
-}
-if (!is.null(opt$max_cell_type_clusters)) {
-    cmd = paste(cmd, "-K", opt$max_cell_type_clusters, sep=" ");
-}
-if (!is.null(opt$prior_concentration)) {
-    cmd = paste(cmd, "-A", opt$prior_concentration, sep=" ");
-}
-cmd = paste(cmd, "-sample", opt$burnin_num, opt$mcmc_num, sep=" ");
-if (!is.null(opt$minerr)) {
-    cmd = paste(cmd, "-minerr", opt$minerr, sep=" ");
-}
-if (!is.null(opt$maxerr)) {
-    cmd = paste(cmd, "-maxerr", opt$maxerr, sep=" ");
-}
-cmd = paste(cmd, "-rseed", opt$rseed, sep=" ");
-cmd = paste(cmd, "-thread", opt$thread, sep=" ");
 
 if (is.null(opt$training_iterations)) {
-    final_cmd = paste(cmd, "-o", output_base_name, sep=" ");
-    final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name);
-    run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+    # Not performing training.
+    if (is.null(windows_by_chrom)) {
+        # Not performing windows by chromosome.
+        output_name = output_base_name;
+        cmd = paste(base_cmd, "-o", output_name, sep=" ");
+        cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+    } else {
+        # Performing windows by chromosome.
+        for (i in 1:length(windows_by_chrom)) {
+            line = windows_by_chrom[i];
+            items = strsplit(line, " ")[[1]];
+            chrom = items[1];
+            window_start = items[2];
+            window_end = items[3];
+            output_name = paste(output_base_name,, chrom, sep=".");
+            cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
+            cmd = paste(cmd, "-o", output_name, sep=" ");
+            cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+            run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        }
+    }
 } else {
-    output_para0 = paste(output_base_name, ".para0", sep="");
-    output_profile0 = paste(output_base_name, ".profile0", sep="");
+    # performing training.
+    output_para0 = paste(output_base_name, "para0", sep=".");
+    output_profile0 = paste(output_base_name, "profile0", sep=".");
     for (i in 1:opt$training_iterations) {
-        final_cmd = paste(cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" ");
-        final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name);
-        run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" ");
+        cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
     }
-    tpara = combine_state(paste(output_base_name, ".tmp.", (1:opt$training_iterations), ".para", sep=""), mycut=0.5);
+    tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5);
+    write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F);
     para = tpara$para;
-    write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F);
     para = apply(para, 1, function(x){paste(x, collapse=" ")});
-    para = c(readLines(paste(output_base_name, ".tmp.1.para", sep=""), n=1), para);
+    para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para);
     writeLines(para, output_para0);
-    cmd = c(cmd, "-otherpara", output_para0, output_profile0);
-    if (length(which(cmd == "-G")) == 0) {
-        cmd = c(cmd, "-G", length(para)-1);
+    # Now run IDEAS based on the files produced during training.
+    base_cmd = get_post_training_base_cmd(base_cmd, para);
+    base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" ");
+    if (is.null(windows_by_chrom)) {
+        cmd = c(base_cmd, "-o", output_base_name);
+        cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
     } else {
-        tt = which(cmd == "-G");
-        cmd[tt + 1] = length(para)-1;
+        # Performing windows by chromosome.
+        if (length(windows_by_chrom) == 1) {
+            output_name = paste(output_base_name, i, sep=".");
+            cmd = c(base_cmd, "-o", output_name);
+            cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+            run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+        } else {
+            for (i in 1:length(windows_by_chrom)) {
+                line = windows_by_chrom[i];
+                items = strsplit(line, " ")[[1]];
+                chrom = items[[1]];
+                window_start = items[[2]];
+                window_end = items[[3]];
+                cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
+                output_name = paste(output_base_name, chrom, sep=".");
+                cmd = paste(cmd, "-o", output_name, sep=" ");
+                cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+                run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
+            }
+        }
     }
-    tt = which(cmd == '-C');
-    if(length(tt)>0) {
-        cmd = cmd[-c(tt, tt+1)];
-    }
+    # Remove temporary outputs.
+    remove_files(path=".", pattern="tmp");
 }