comparison ideas.R @ 166:7d33a8157c6b draft

Uploaded
author greg
date Thu, 18 Jan 2018 13:47:50 -0500
parents b140097a984e
children c5b77e9b36f1
comparison
equal deleted inserted replaced
165:bb5544d1c85e 166:7d33a8157c6b
26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), 26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"),
27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"), 27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"),
28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), 28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"),
29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), 29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"),
30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), 30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"),
31 make_option(c("--window_end"), action="store", dest="window_end", type="integer", default=NULL, help="Windows positions by chromosome end value"), 31 make_option(c("--windows_config"), action="store", dest="windows_config", default=NULL, help="Windows positions by chroms config")
32 make_option(c("--window_start"), action="store", dest="window_start", type="integer", default=NULL, help="Windows positions by chromosome start value")
33 ) 32 )
34 33
35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) 34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list)
36 args <- parse_args(parser, positional_arguments=TRUE) 35 args <- parse_args(parser, positional_arguments=TRUE)
37 opt <- args$options 36 opt <- args$options
38 37
39 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { 38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) {
40 if (is.null(save_ideas_log)) { 39 if (is.null(save_ideas_log)) {
41 cmd = paste(cmd, "&>>", default_log_name, sep=" "); 40 new_cmd = c(cmd, "&>>", default_log_name);
42 }else { 41 }else {
43 cmd = paste(cmd, "&>>", output_log, sep=" "); 42 new_cmd = c(cmd, "&>>", output_log);
44 } 43 }
45 return(cmd); 44 return(paste(new_cmd, collapse=" "));
46 } 45 }
47 46
48 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { 47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) {
49 X = NULL; 48 X = NULL;
50 K = NULL; 49 K = NULL;
239 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); 238 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))}));
240 } 239 }
241 return(dd); 240 return(dd);
242 } 241 }
243 242
243 get_base_cmd <- function(prep_output_config, windows_bed, training_iterations, bychr, hp, norm, log2,
244 max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration,
245 burnin_num, mcmc_num, minerr, maxerr, rseed, thread) {
246 base_cmd = paste("ideas", prep_output_config, sep=" ");
247 if (!is.null(windows_bed)) {
248 base_cmd = paste(base_cmd, windows_bed, sep=" ");
249 }
250 if (!is.null(training_iterations)) {
251 base_cmd = paste(base_cmd, "-impute none", sep=" ");
252 }
253 if (bychr) {
254 base_cmd = paste(base_cmd, "-bychr", sep=" ");
255 }
256 if (hp) {
257 base_cmd = paste(base_cmd, "-hp", sep=" ");
258 }
259 if (norm) {
260 base_cmd = paste(base_cmd, "-norm", sep=" ");
261 }
262 if (!is.null(log2)) {
263 base_cmd = paste(base_cmd, "-log2", log2, sep=" ");
264 }
265 if (!is.null(max_states)) {
266 base_cmd = paste(base_cmd, "-G", max_states, sep=" ");
267 }
268 if (!is.null(initial_states)) {
269 base_cmd = paste(base_cmd, "-C", initial_states, sep=" ");
270 }
271 if (!is.null(max_position_classes)) {
272 base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" ");
273 }
274 if (!is.null(max_cell_type_clusters)) {
275 base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" ");
276 }
277 if (!is.null(prior_concentration)) {
278 base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" ");
279 }
280 base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" ");
281 if (!is.null(minerr)) {
282 base_cmd = paste(base_cmd, "-minerr", minerr, sep=" ");
283 }
284 if (!is.null(maxerr)) {
285 base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" ");
286 }
287 base_cmd = paste(base_cmd, "-rseed", rseed, sep=" ");
288 base_cmd = paste(base_cmd, "-thread", thread, sep=" ");
289 return(base_cmd);
290 }
291
244 get_mean <- function(n) { 292 get_mean <- function(n) {
245 N = NULL; 293 N = NULL;
246 for(i in sort(unique(n[,1]))) { 294 for(i in sort(unique(n[,1]))) {
247 t = which(n[,1]==i); 295 t = which(n[,1]==i);
248 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); 296 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]);
249 } 297 }
250 NN = N[,-1] / N[,1]; 298 NN = N[,-1] / N[,1];
251 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); 299 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2)));
300 }
301
302 get_post_training_base_cmd <- function(base_cmd, para) {
303 # Change base_cmd due to training mode.
304 base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]];
305 if (length(which(base_cmd_items == "-G")) == 0) {
306 base_cmd_items = c(base_cmd_items, "-G", length(para)-1);
307 } else {
308 tt = which(base_cmd_items == "-G");
309 base_cmd_items[tt + 1] = length(para)-1;
310 }
311 tt = which(base_cmd_items == '-C');
312 if(length(tt) > 0) {
313 base_cmd_items = base_cmd_items[-c(tt, tt+1)];
314 }
315 base_cmd = paste(base_cmd_items, collapse=" ");
316 return(base_cmd);
317 }
318
319 get_windows_by_chrom <- function(windows_config) {
320 if (is.null(windows_config)) {
321 windows_by_chrom = NULL;
322 } else {
323 fh = file(windows_config, "r");
324 windows_by_chrom = readLines(fh);
325 close(fh);
326 }
327 return(windows_by_chrom);
252 } 328 }
253 329
254 make_parameter <- function(myorder, id, mem, mycut, para) { 330 make_parameter <- function(myorder, id, mem, mycut, para) {
255 rt = NULL; 331 rt = NULL;
256 j = 0; 332 j = 0;
262 } 338 }
263 } 339 }
264 return(rt); 340 return(rt);
265 } 341 }
266 342
343 remove_files <- function(path, pattern) {
344 files = list.files(path=path, pattern=pattern);
345 for (f in files) {
346 unlink(f);
347 }
348 }
349
267 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { 350 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) {
268 cat("\n\n >>>>> cmd:\n", cmd, "\n\n");
269 rc = system(cmd); 351 rc = system(cmd);
270 if (rc != 0) { 352 if (rc != 0) {
271 if (is.null(save_ideas_log)) { 353 if (is.null(save_ideas_log)) {
272 file.rename(default_log_name, output_log); 354 file.rename(default_log_name, output_log);
273 } 355 }
274 quit(rc); 356 quit(rc);
275 } 357 }
276 } 358 }
277 359
278 default_log_name = "ideas_log.txt"; 360 default_log_name = "ideas_log.txt";
361 windows_by_chrom = get_windows_by_chrom(opt$windows_config);
362 base_cmd = get_base_cmd(opt$prep_output_config, opt$windows_bed, opt$training_iterations, opt$bychr,
363 opt$hp, opt$norm, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes,
364 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr,
365 opt$maxerr, opt$rseed, opt$thread);
279 output_base_name = opt$project_name; 366 output_base_name = opt$project_name;
280 cmd = paste("ideas", opt$prep_output_config, sep=" ");
281 if (!is.null(opt$windows_bed)) {
282 cmd = paste(cmd, opt$windows_bed, sep=" ");
283 }
284 if (!is.null(opt$training_iterations)) {
285 cmd = paste(cmd, "-impute none", sep=" ");
286 }
287 if (opt$bychr) {
288 cmd = paste(cmd, "-bychr", sep=" ");
289 }
290 if (opt$hp) {
291 cmd = paste(cmd, "-hp", sep=" ");
292 }
293 if (opt$norm) {
294 cmd = paste(cmd, "-norm", sep=" ");
295 }
296 if (!is.null(opt$window_start) && !is.null(opt$window_end)) {
297 cmd = paste(cmd, "-inv", opt$window_start, opt$window_end, sep=" ");
298 }
299 if (!is.null(opt$log2)) {
300 cmd = paste(cmd, "-log2", opt$log2, sep=" ");
301 }
302 if (!is.null(opt$max_states)) {
303 cmd = paste(cmd, "-G", opt$max_states, sep=" ");
304 }
305 if (!is.null(opt$initial_states)) {
306 cmd = paste(cmd, "-C", opt$initial_states, sep=" ");
307 }
308 if (!is.null(opt$max_position_classes)) {
309 cmd = paste(cmd, "-P", opt$max_position_classes, sep=" ");
310 }
311 if (!is.null(opt$max_cell_type_clusters)) {
312 cmd = paste(cmd, "-K", opt$max_cell_type_clusters, sep=" ");
313 }
314 if (!is.null(opt$prior_concentration)) {
315 cmd = paste(cmd, "-A", opt$prior_concentration, sep=" ");
316 }
317 cmd = paste(cmd, "-sample", opt$burnin_num, opt$mcmc_num, sep=" ");
318 if (!is.null(opt$minerr)) {
319 cmd = paste(cmd, "-minerr", opt$minerr, sep=" ");
320 }
321 if (!is.null(opt$maxerr)) {
322 cmd = paste(cmd, "-maxerr", opt$maxerr, sep=" ");
323 }
324 cmd = paste(cmd, "-rseed", opt$rseed, sep=" ");
325 cmd = paste(cmd, "-thread", opt$thread, sep=" ");
326 367
327 if (is.null(opt$training_iterations)) { 368 if (is.null(opt$training_iterations)) {
328 final_cmd = paste(cmd, "-o", output_base_name, sep=" "); 369 # Not performing training.
329 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); 370 if (is.null(windows_by_chrom)) {
330 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); 371 # Not performing windows by chromosome.
372 output_name = output_base_name;
373 cmd = paste(base_cmd, "-o", output_name, sep=" ");
374 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
375 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
376 } else {
377 # Performing windows by chromosome.
378 for (i in 1:length(windows_by_chrom)) {
379 line = windows_by_chrom[i];
380 items = strsplit(line, " ")[[1]];
381 chrom = items[1];
382 window_start = items[2];
383 window_end = items[3];
384 output_name = paste(output_base_name,, chrom, sep=".");
385 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
386 cmd = paste(cmd, "-o", output_name, sep=" ");
387 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
388 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
389 }
390 }
331 } else { 391 } else {
332 output_para0 = paste(output_base_name, ".para0", sep=""); 392 # performing training.
333 output_profile0 = paste(output_base_name, ".profile0", sep=""); 393 output_para0 = paste(output_base_name, "para0", sep=".");
394 output_profile0 = paste(output_base_name, "profile0", sep=".");
334 for (i in 1:opt$training_iterations) { 395 for (i in 1:opt$training_iterations) {
335 final_cmd = paste(cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); 396 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" ");
336 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); 397 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
337 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); 398 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
338 } 399 }
339 tpara = combine_state(paste(output_base_name, ".tmp.", (1:opt$training_iterations), ".para", sep=""), mycut=0.5); 400 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5);
401 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F);
340 para = tpara$para; 402 para = tpara$para;
341 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F);
342 para = apply(para, 1, function(x){paste(x, collapse=" ")}); 403 para = apply(para, 1, function(x){paste(x, collapse=" ")});
343 para = c(readLines(paste(output_base_name, ".tmp.1.para", sep=""), n=1), para); 404 para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para);
344 writeLines(para, output_para0); 405 writeLines(para, output_para0);
345 cmd = c(cmd, "-otherpara", output_para0, output_profile0); 406 # Now run IDEAS based on the files produced during training.
346 if (length(which(cmd == "-G")) == 0) { 407 base_cmd = get_post_training_base_cmd(base_cmd, para);
347 cmd = c(cmd, "-G", length(para)-1); 408 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" ");
409 if (is.null(windows_by_chrom)) {
410 cmd = c(base_cmd, "-o", output_base_name);
411 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
412 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
348 } else { 413 } else {
349 tt = which(cmd == "-G"); 414 # Performing windows by chromosome.
350 cmd[tt + 1] = length(para)-1; 415 if (length(windows_by_chrom) == 1) {
351 } 416 output_name = paste(output_base_name, i, sep=".");
352 tt = which(cmd == '-C'); 417 cmd = c(base_cmd, "-o", output_name);
353 if(length(tt)>0) { 418 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
354 cmd = cmd[-c(tt, tt+1)]; 419 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
355 } 420 } else {
356 } 421 for (i in 1:length(windows_by_chrom)) {
422 line = windows_by_chrom[i];
423 items = strsplit(line, " ")[[1]];
424 chrom = items[[1]];
425 window_start = items[[2]];
426 window_end = items[[3]];
427 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" ");
428 output_name = paste(output_base_name, chrom, sep=".");
429 cmd = paste(cmd, "-o", output_name, sep=" ");
430 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
431 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name);
432 }
433 }
434 }
435 # Remove temporary outputs.
436 remove_files(path=".", pattern="tmp");
437 }