Mercurial > repos > greg > ideas
comparison ideas.R @ 166:7d33a8157c6b draft
Uploaded
author | greg |
---|---|
date | Thu, 18 Jan 2018 13:47:50 -0500 |
parents | b140097a984e |
children | c5b77e9b36f1 |
comparison
equal
deleted
inserted
replaced
165:bb5544d1c85e | 166:7d33a8157c6b |
---|---|
26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), | 26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), |
27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"), | 27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"), |
28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), | 28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), |
29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), | 29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), |
30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), | 30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), |
31 make_option(c("--window_end"), action="store", dest="window_end", type="integer", default=NULL, help="Windows positions by chromosome end value"), | 31 make_option(c("--windows_config"), action="store", dest="windows_config", default=NULL, help="Windows positions by chroms config") |
32 make_option(c("--window_start"), action="store", dest="window_start", type="integer", default=NULL, help="Windows positions by chromosome start value") | |
33 ) | 32 ) |
34 | 33 |
35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | 34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) |
36 args <- parse_args(parser, positional_arguments=TRUE) | 35 args <- parse_args(parser, positional_arguments=TRUE) |
37 opt <- args$options | 36 opt <- args$options |
38 | 37 |
39 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { | 38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { |
40 if (is.null(save_ideas_log)) { | 39 if (is.null(save_ideas_log)) { |
41 cmd = paste(cmd, "&>>", default_log_name, sep=" "); | 40 new_cmd = c(cmd, "&>>", default_log_name); |
42 }else { | 41 }else { |
43 cmd = paste(cmd, "&>>", output_log, sep=" "); | 42 new_cmd = c(cmd, "&>>", output_log); |
44 } | 43 } |
45 return(cmd); | 44 return(paste(new_cmd, collapse=" ")); |
46 } | 45 } |
47 | 46 |
48 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { | 47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { |
49 X = NULL; | 48 X = NULL; |
50 K = NULL; | 49 K = NULL; |
239 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); | 238 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); |
240 } | 239 } |
241 return(dd); | 240 return(dd); |
242 } | 241 } |
243 | 242 |
243 get_base_cmd <- function(prep_output_config, windows_bed, training_iterations, bychr, hp, norm, log2, | |
244 max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration, | |
245 burnin_num, mcmc_num, minerr, maxerr, rseed, thread) { | |
246 base_cmd = paste("ideas", prep_output_config, sep=" "); | |
247 if (!is.null(windows_bed)) { | |
248 base_cmd = paste(base_cmd, windows_bed, sep=" "); | |
249 } | |
250 if (!is.null(training_iterations)) { | |
251 base_cmd = paste(base_cmd, "-impute none", sep=" "); | |
252 } | |
253 if (bychr) { | |
254 base_cmd = paste(base_cmd, "-bychr", sep=" "); | |
255 } | |
256 if (hp) { | |
257 base_cmd = paste(base_cmd, "-hp", sep=" "); | |
258 } | |
259 if (norm) { | |
260 base_cmd = paste(base_cmd, "-norm", sep=" "); | |
261 } | |
262 if (!is.null(log2)) { | |
263 base_cmd = paste(base_cmd, "-log2", log2, sep=" "); | |
264 } | |
265 if (!is.null(max_states)) { | |
266 base_cmd = paste(base_cmd, "-G", max_states, sep=" "); | |
267 } | |
268 if (!is.null(initial_states)) { | |
269 base_cmd = paste(base_cmd, "-C", initial_states, sep=" "); | |
270 } | |
271 if (!is.null(max_position_classes)) { | |
272 base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" "); | |
273 } | |
274 if (!is.null(max_cell_type_clusters)) { | |
275 base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" "); | |
276 } | |
277 if (!is.null(prior_concentration)) { | |
278 base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" "); | |
279 } | |
280 base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" "); | |
281 if (!is.null(minerr)) { | |
282 base_cmd = paste(base_cmd, "-minerr", minerr, sep=" "); | |
283 } | |
284 if (!is.null(maxerr)) { | |
285 base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" "); | |
286 } | |
287 base_cmd = paste(base_cmd, "-rseed", rseed, sep=" "); | |
288 base_cmd = paste(base_cmd, "-thread", thread, sep=" "); | |
289 return(base_cmd); | |
290 } | |
291 | |
244 get_mean <- function(n) { | 292 get_mean <- function(n) { |
245 N = NULL; | 293 N = NULL; |
246 for(i in sort(unique(n[,1]))) { | 294 for(i in sort(unique(n[,1]))) { |
247 t = which(n[,1]==i); | 295 t = which(n[,1]==i); |
248 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); | 296 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); |
249 } | 297 } |
250 NN = N[,-1] / N[,1]; | 298 NN = N[,-1] / N[,1]; |
251 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); | 299 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); |
300 } | |
301 | |
302 get_post_training_base_cmd <- function(base_cmd, para) { | |
303 # Change base_cmd due to training mode. | |
304 base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]]; | |
305 if (length(which(base_cmd_items == "-G")) == 0) { | |
306 base_cmd_items = c(base_cmd_items, "-G", length(para)-1); | |
307 } else { | |
308 tt = which(base_cmd_items == "-G"); | |
309 base_cmd_items[tt + 1] = length(para)-1; | |
310 } | |
311 tt = which(base_cmd_items == '-C'); | |
312 if(length(tt) > 0) { | |
313 base_cmd_items = base_cmd_items[-c(tt, tt+1)]; | |
314 } | |
315 base_cmd = paste(base_cmd_items, collapse=" "); | |
316 return(base_cmd); | |
317 } | |
318 | |
319 get_windows_by_chrom <- function(windows_config) { | |
320 if (is.null(windows_config)) { | |
321 windows_by_chrom = NULL; | |
322 } else { | |
323 fh = file(windows_config, "r"); | |
324 windows_by_chrom = readLines(fh); | |
325 close(fh); | |
326 } | |
327 return(windows_by_chrom); | |
252 } | 328 } |
253 | 329 |
254 make_parameter <- function(myorder, id, mem, mycut, para) { | 330 make_parameter <- function(myorder, id, mem, mycut, para) { |
255 rt = NULL; | 331 rt = NULL; |
256 j = 0; | 332 j = 0; |
262 } | 338 } |
263 } | 339 } |
264 return(rt); | 340 return(rt); |
265 } | 341 } |
266 | 342 |
343 remove_files <- function(path, pattern) { | |
344 files = list.files(path=path, pattern=pattern); | |
345 for (f in files) { | |
346 unlink(f); | |
347 } | |
348 } | |
349 | |
267 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { | 350 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { |
268 cat("\n\n >>>>> cmd:\n", cmd, "\n\n"); | |
269 rc = system(cmd); | 351 rc = system(cmd); |
270 if (rc != 0) { | 352 if (rc != 0) { |
271 if (is.null(save_ideas_log)) { | 353 if (is.null(save_ideas_log)) { |
272 file.rename(default_log_name, output_log); | 354 file.rename(default_log_name, output_log); |
273 } | 355 } |
274 quit(rc); | 356 quit(rc); |
275 } | 357 } |
276 } | 358 } |
277 | 359 |
278 default_log_name = "ideas_log.txt"; | 360 default_log_name = "ideas_log.txt"; |
361 windows_by_chrom = get_windows_by_chrom(opt$windows_config); | |
362 base_cmd = get_base_cmd(opt$prep_output_config, opt$windows_bed, opt$training_iterations, opt$bychr, | |
363 opt$hp, opt$norm, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, | |
364 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, | |
365 opt$maxerr, opt$rseed, opt$thread); | |
279 output_base_name = opt$project_name; | 366 output_base_name = opt$project_name; |
280 cmd = paste("ideas", opt$prep_output_config, sep=" "); | |
281 if (!is.null(opt$windows_bed)) { | |
282 cmd = paste(cmd, opt$windows_bed, sep=" "); | |
283 } | |
284 if (!is.null(opt$training_iterations)) { | |
285 cmd = paste(cmd, "-impute none", sep=" "); | |
286 } | |
287 if (opt$bychr) { | |
288 cmd = paste(cmd, "-bychr", sep=" "); | |
289 } | |
290 if (opt$hp) { | |
291 cmd = paste(cmd, "-hp", sep=" "); | |
292 } | |
293 if (opt$norm) { | |
294 cmd = paste(cmd, "-norm", sep=" "); | |
295 } | |
296 if (!is.null(opt$window_start) && !is.null(opt$window_end)) { | |
297 cmd = paste(cmd, "-inv", opt$window_start, opt$window_end, sep=" "); | |
298 } | |
299 if (!is.null(opt$log2)) { | |
300 cmd = paste(cmd, "-log2", opt$log2, sep=" "); | |
301 } | |
302 if (!is.null(opt$max_states)) { | |
303 cmd = paste(cmd, "-G", opt$max_states, sep=" "); | |
304 } | |
305 if (!is.null(opt$initial_states)) { | |
306 cmd = paste(cmd, "-C", opt$initial_states, sep=" "); | |
307 } | |
308 if (!is.null(opt$max_position_classes)) { | |
309 cmd = paste(cmd, "-P", opt$max_position_classes, sep=" "); | |
310 } | |
311 if (!is.null(opt$max_cell_type_clusters)) { | |
312 cmd = paste(cmd, "-K", opt$max_cell_type_clusters, sep=" "); | |
313 } | |
314 if (!is.null(opt$prior_concentration)) { | |
315 cmd = paste(cmd, "-A", opt$prior_concentration, sep=" "); | |
316 } | |
317 cmd = paste(cmd, "-sample", opt$burnin_num, opt$mcmc_num, sep=" "); | |
318 if (!is.null(opt$minerr)) { | |
319 cmd = paste(cmd, "-minerr", opt$minerr, sep=" "); | |
320 } | |
321 if (!is.null(opt$maxerr)) { | |
322 cmd = paste(cmd, "-maxerr", opt$maxerr, sep=" "); | |
323 } | |
324 cmd = paste(cmd, "-rseed", opt$rseed, sep=" "); | |
325 cmd = paste(cmd, "-thread", opt$thread, sep=" "); | |
326 | 367 |
327 if (is.null(opt$training_iterations)) { | 368 if (is.null(opt$training_iterations)) { |
328 final_cmd = paste(cmd, "-o", output_base_name, sep=" "); | 369 # Not performing training. |
329 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 370 if (is.null(windows_by_chrom)) { |
330 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 371 # Not performing windows by chromosome. |
372 output_name = output_base_name; | |
373 cmd = paste(base_cmd, "-o", output_name, sep=" "); | |
374 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
375 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
376 } else { | |
377 # Performing windows by chromosome. | |
378 for (i in 1:length(windows_by_chrom)) { | |
379 line = windows_by_chrom[i]; | |
380 items = strsplit(line, " ")[[1]]; | |
381 chrom = items[1]; | |
382 window_start = items[2]; | |
383 window_end = items[3]; | |
384 output_name = paste(output_base_name,, chrom, sep="."); | |
385 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
386 cmd = paste(cmd, "-o", output_name, sep=" "); | |
387 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
388 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
389 } | |
390 } | |
331 } else { | 391 } else { |
332 output_para0 = paste(output_base_name, ".para0", sep=""); | 392 # performing training. |
333 output_profile0 = paste(output_base_name, ".profile0", sep=""); | 393 output_para0 = paste(output_base_name, "para0", sep="."); |
394 output_profile0 = paste(output_base_name, "profile0", sep="."); | |
334 for (i in 1:opt$training_iterations) { | 395 for (i in 1:opt$training_iterations) { |
335 final_cmd = paste(cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); | 396 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); |
336 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 397 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
337 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 398 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
338 } | 399 } |
339 tpara = combine_state(paste(output_base_name, ".tmp.", (1:opt$training_iterations), ".para", sep=""), mycut=0.5); | 400 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); |
401 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | |
340 para = tpara$para; | 402 para = tpara$para; |
341 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | |
342 para = apply(para, 1, function(x){paste(x, collapse=" ")}); | 403 para = apply(para, 1, function(x){paste(x, collapse=" ")}); |
343 para = c(readLines(paste(output_base_name, ".tmp.1.para", sep=""), n=1), para); | 404 para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para); |
344 writeLines(para, output_para0); | 405 writeLines(para, output_para0); |
345 cmd = c(cmd, "-otherpara", output_para0, output_profile0); | 406 # Now run IDEAS based on the files produced during training. |
346 if (length(which(cmd == "-G")) == 0) { | 407 base_cmd = get_post_training_base_cmd(base_cmd, para); |
347 cmd = c(cmd, "-G", length(para)-1); | 408 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); |
409 if (is.null(windows_by_chrom)) { | |
410 cmd = c(base_cmd, "-o", output_base_name); | |
411 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
412 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
348 } else { | 413 } else { |
349 tt = which(cmd == "-G"); | 414 # Performing windows by chromosome. |
350 cmd[tt + 1] = length(para)-1; | 415 if (length(windows_by_chrom) == 1) { |
351 } | 416 output_name = paste(output_base_name, i, sep="."); |
352 tt = which(cmd == '-C'); | 417 cmd = c(base_cmd, "-o", output_name); |
353 if(length(tt)>0) { | 418 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
354 cmd = cmd[-c(tt, tt+1)]; | 419 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
355 } | 420 } else { |
356 } | 421 for (i in 1:length(windows_by_chrom)) { |
422 line = windows_by_chrom[i]; | |
423 items = strsplit(line, " ")[[1]]; | |
424 chrom = items[[1]]; | |
425 window_start = items[[2]]; | |
426 window_end = items[[3]]; | |
427 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
428 output_name = paste(output_base_name, chrom, sep="."); | |
429 cmd = paste(cmd, "-o", output_name, sep=" "); | |
430 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
431 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
432 } | |
433 } | |
434 } | |
435 # Remove temporary outputs. | |
436 remove_files(path=".", pattern="tmp"); | |
437 } |