Mercurial > repos > greg > ideas
comparison ideas.R @ 166:7d33a8157c6b draft
Uploaded
| author | greg |
|---|---|
| date | Thu, 18 Jan 2018 13:47:50 -0500 |
| parents | b140097a984e |
| children | c5b77e9b36f1 |
comparison
equal
deleted
inserted
replaced
| 165:bb5544d1c85e | 166:7d33a8157c6b |
|---|---|
| 26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), | 26 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), |
| 27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"), | 27 make_option(c("--tmp_dir"), action="store", dest="tmp_dir", help="Directory of bed files"), |
| 28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), | 28 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), |
| 29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), | 29 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations"), |
| 30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), | 30 make_option(c("--windows_bed"), action="store", dest="windows_bed", default=NULL, help="Bed file containing bed windows"), |
| 31 make_option(c("--window_end"), action="store", dest="window_end", type="integer", default=NULL, help="Windows positions by chromosome end value"), | 31 make_option(c("--windows_config"), action="store", dest="windows_config", default=NULL, help="Windows positions by chroms config") |
| 32 make_option(c("--window_start"), action="store", dest="window_start", type="integer", default=NULL, help="Windows positions by chromosome start value") | |
| 33 ) | 32 ) |
| 34 | 33 |
| 35 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | 34 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) |
| 36 args <- parse_args(parser, positional_arguments=TRUE) | 35 args <- parse_args(parser, positional_arguments=TRUE) |
| 37 opt <- args$options | 36 opt <- args$options |
| 38 | 37 |
| 39 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { | 38 add_output_redirect <- function(cmd, save_ideas_log, output_log, default_log_name) { |
| 40 if (is.null(save_ideas_log)) { | 39 if (is.null(save_ideas_log)) { |
| 41 cmd = paste(cmd, "&>>", default_log_name, sep=" "); | 40 new_cmd = c(cmd, "&>>", default_log_name); |
| 42 }else { | 41 }else { |
| 43 cmd = paste(cmd, "&>>", output_log, sep=" "); | 42 new_cmd = c(cmd, "&>>", output_log); |
| 44 } | 43 } |
| 45 return(cmd); | 44 return(paste(new_cmd, collapse=" ")); |
| 46 } | 45 } |
| 47 | 46 |
| 48 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { | 47 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { |
| 49 X = NULL; | 48 X = NULL; |
| 50 K = NULL; | 49 K = NULL; |
| 239 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); | 238 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); |
| 240 } | 239 } |
| 241 return(dd); | 240 return(dd); |
| 242 } | 241 } |
| 243 | 242 |
| 243 get_base_cmd <- function(prep_output_config, windows_bed, training_iterations, bychr, hp, norm, log2, | |
| 244 max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration, | |
| 245 burnin_num, mcmc_num, minerr, maxerr, rseed, thread) { | |
| 246 base_cmd = paste("ideas", prep_output_config, sep=" "); | |
| 247 if (!is.null(windows_bed)) { | |
| 248 base_cmd = paste(base_cmd, windows_bed, sep=" "); | |
| 249 } | |
| 250 if (!is.null(training_iterations)) { | |
| 251 base_cmd = paste(base_cmd, "-impute none", sep=" "); | |
| 252 } | |
| 253 if (bychr) { | |
| 254 base_cmd = paste(base_cmd, "-bychr", sep=" "); | |
| 255 } | |
| 256 if (hp) { | |
| 257 base_cmd = paste(base_cmd, "-hp", sep=" "); | |
| 258 } | |
| 259 if (norm) { | |
| 260 base_cmd = paste(base_cmd, "-norm", sep=" "); | |
| 261 } | |
| 262 if (!is.null(log2)) { | |
| 263 base_cmd = paste(base_cmd, "-log2", log2, sep=" "); | |
| 264 } | |
| 265 if (!is.null(max_states)) { | |
| 266 base_cmd = paste(base_cmd, "-G", max_states, sep=" "); | |
| 267 } | |
| 268 if (!is.null(initial_states)) { | |
| 269 base_cmd = paste(base_cmd, "-C", initial_states, sep=" "); | |
| 270 } | |
| 271 if (!is.null(max_position_classes)) { | |
| 272 base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" "); | |
| 273 } | |
| 274 if (!is.null(max_cell_type_clusters)) { | |
| 275 base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" "); | |
| 276 } | |
| 277 if (!is.null(prior_concentration)) { | |
| 278 base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" "); | |
| 279 } | |
| 280 base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" "); | |
| 281 if (!is.null(minerr)) { | |
| 282 base_cmd = paste(base_cmd, "-minerr", minerr, sep=" "); | |
| 283 } | |
| 284 if (!is.null(maxerr)) { | |
| 285 base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" "); | |
| 286 } | |
| 287 base_cmd = paste(base_cmd, "-rseed", rseed, sep=" "); | |
| 288 base_cmd = paste(base_cmd, "-thread", thread, sep=" "); | |
| 289 return(base_cmd); | |
| 290 } | |
| 291 | |
| 244 get_mean <- function(n) { | 292 get_mean <- function(n) { |
| 245 N = NULL; | 293 N = NULL; |
| 246 for(i in sort(unique(n[,1]))) { | 294 for(i in sort(unique(n[,1]))) { |
| 247 t = which(n[,1]==i); | 295 t = which(n[,1]==i); |
| 248 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); | 296 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); |
| 249 } | 297 } |
| 250 NN = N[,-1] / N[,1]; | 298 NN = N[,-1] / N[,1]; |
| 251 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); | 299 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); |
| 300 } | |
| 301 | |
| 302 get_post_training_base_cmd <- function(base_cmd, para) { | |
| 303 # Change base_cmd due to training mode. | |
| 304 base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]]; | |
| 305 if (length(which(base_cmd_items == "-G")) == 0) { | |
| 306 base_cmd_items = c(base_cmd_items, "-G", length(para)-1); | |
| 307 } else { | |
| 308 tt = which(base_cmd_items == "-G"); | |
| 309 base_cmd_items[tt + 1] = length(para)-1; | |
| 310 } | |
| 311 tt = which(base_cmd_items == '-C'); | |
| 312 if(length(tt) > 0) { | |
| 313 base_cmd_items = base_cmd_items[-c(tt, tt+1)]; | |
| 314 } | |
| 315 base_cmd = paste(base_cmd_items, collapse=" "); | |
| 316 return(base_cmd); | |
| 317 } | |
| 318 | |
| 319 get_windows_by_chrom <- function(windows_config) { | |
| 320 if (is.null(windows_config)) { | |
| 321 windows_by_chrom = NULL; | |
| 322 } else { | |
| 323 fh = file(windows_config, "r"); | |
| 324 windows_by_chrom = readLines(fh); | |
| 325 close(fh); | |
| 326 } | |
| 327 return(windows_by_chrom); | |
| 252 } | 328 } |
| 253 | 329 |
| 254 make_parameter <- function(myorder, id, mem, mycut, para) { | 330 make_parameter <- function(myorder, id, mem, mycut, para) { |
| 255 rt = NULL; | 331 rt = NULL; |
| 256 j = 0; | 332 j = 0; |
| 262 } | 338 } |
| 263 } | 339 } |
| 264 return(rt); | 340 return(rt); |
| 265 } | 341 } |
| 266 | 342 |
| 343 remove_files <- function(path, pattern) { | |
| 344 files = list.files(path=path, pattern=pattern); | |
| 345 for (f in files) { | |
| 346 unlink(f); | |
| 347 } | |
| 348 } | |
| 349 | |
| 267 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { | 350 run_cmd <- function(cmd, save_ideas_log, output_log, default_log_name) { |
| 268 cat("\n\n >>>>> cmd:\n", cmd, "\n\n"); | |
| 269 rc = system(cmd); | 351 rc = system(cmd); |
| 270 if (rc != 0) { | 352 if (rc != 0) { |
| 271 if (is.null(save_ideas_log)) { | 353 if (is.null(save_ideas_log)) { |
| 272 file.rename(default_log_name, output_log); | 354 file.rename(default_log_name, output_log); |
| 273 } | 355 } |
| 274 quit(rc); | 356 quit(rc); |
| 275 } | 357 } |
| 276 } | 358 } |
| 277 | 359 |
| 278 default_log_name = "ideas_log.txt"; | 360 default_log_name = "ideas_log.txt"; |
| 361 windows_by_chrom = get_windows_by_chrom(opt$windows_config); | |
| 362 base_cmd = get_base_cmd(opt$prep_output_config, opt$windows_bed, opt$training_iterations, opt$bychr, | |
| 363 opt$hp, opt$norm, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, | |
| 364 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, | |
| 365 opt$maxerr, opt$rseed, opt$thread); | |
| 279 output_base_name = opt$project_name; | 366 output_base_name = opt$project_name; |
| 280 cmd = paste("ideas", opt$prep_output_config, sep=" "); | |
| 281 if (!is.null(opt$windows_bed)) { | |
| 282 cmd = paste(cmd, opt$windows_bed, sep=" "); | |
| 283 } | |
| 284 if (!is.null(opt$training_iterations)) { | |
| 285 cmd = paste(cmd, "-impute none", sep=" "); | |
| 286 } | |
| 287 if (opt$bychr) { | |
| 288 cmd = paste(cmd, "-bychr", sep=" "); | |
| 289 } | |
| 290 if (opt$hp) { | |
| 291 cmd = paste(cmd, "-hp", sep=" "); | |
| 292 } | |
| 293 if (opt$norm) { | |
| 294 cmd = paste(cmd, "-norm", sep=" "); | |
| 295 } | |
| 296 if (!is.null(opt$window_start) && !is.null(opt$window_end)) { | |
| 297 cmd = paste(cmd, "-inv", opt$window_start, opt$window_end, sep=" "); | |
| 298 } | |
| 299 if (!is.null(opt$log2)) { | |
| 300 cmd = paste(cmd, "-log2", opt$log2, sep=" "); | |
| 301 } | |
| 302 if (!is.null(opt$max_states)) { | |
| 303 cmd = paste(cmd, "-G", opt$max_states, sep=" "); | |
| 304 } | |
| 305 if (!is.null(opt$initial_states)) { | |
| 306 cmd = paste(cmd, "-C", opt$initial_states, sep=" "); | |
| 307 } | |
| 308 if (!is.null(opt$max_position_classes)) { | |
| 309 cmd = paste(cmd, "-P", opt$max_position_classes, sep=" "); | |
| 310 } | |
| 311 if (!is.null(opt$max_cell_type_clusters)) { | |
| 312 cmd = paste(cmd, "-K", opt$max_cell_type_clusters, sep=" "); | |
| 313 } | |
| 314 if (!is.null(opt$prior_concentration)) { | |
| 315 cmd = paste(cmd, "-A", opt$prior_concentration, sep=" "); | |
| 316 } | |
| 317 cmd = paste(cmd, "-sample", opt$burnin_num, opt$mcmc_num, sep=" "); | |
| 318 if (!is.null(opt$minerr)) { | |
| 319 cmd = paste(cmd, "-minerr", opt$minerr, sep=" "); | |
| 320 } | |
| 321 if (!is.null(opt$maxerr)) { | |
| 322 cmd = paste(cmd, "-maxerr", opt$maxerr, sep=" "); | |
| 323 } | |
| 324 cmd = paste(cmd, "-rseed", opt$rseed, sep=" "); | |
| 325 cmd = paste(cmd, "-thread", opt$thread, sep=" "); | |
| 326 | 367 |
| 327 if (is.null(opt$training_iterations)) { | 368 if (is.null(opt$training_iterations)) { |
| 328 final_cmd = paste(cmd, "-o", output_base_name, sep=" "); | 369 # Not performing training. |
| 329 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 370 if (is.null(windows_by_chrom)) { |
| 330 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 371 # Not performing windows by chromosome. |
| 372 output_name = output_base_name; | |
| 373 cmd = paste(base_cmd, "-o", output_name, sep=" "); | |
| 374 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 375 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 376 } else { | |
| 377 # Performing windows by chromosome. | |
| 378 for (i in 1:length(windows_by_chrom)) { | |
| 379 line = windows_by_chrom[i]; | |
| 380 items = strsplit(line, " ")[[1]]; | |
| 381 chrom = items[1]; | |
| 382 window_start = items[2]; | |
| 383 window_end = items[3]; | |
| 384 output_name = paste(output_base_name,, chrom, sep="."); | |
| 385 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
| 386 cmd = paste(cmd, "-o", output_name, sep=" "); | |
| 387 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 388 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 389 } | |
| 390 } | |
| 331 } else { | 391 } else { |
| 332 output_para0 = paste(output_base_name, ".para0", sep=""); | 392 # performing training. |
| 333 output_profile0 = paste(output_base_name, ".profile0", sep=""); | 393 output_para0 = paste(output_base_name, "para0", sep="."); |
| 394 output_profile0 = paste(output_base_name, "profile0", sep="."); | |
| 334 for (i in 1:opt$training_iterations) { | 395 for (i in 1:opt$training_iterations) { |
| 335 final_cmd = paste(cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); | 396 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); |
| 336 final_cmd = add_output_redirect(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 397 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
| 337 run_cmd(final_cmd, opt$save_ideas_log, opt$output_log, default_log_name); | 398 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
| 338 } | 399 } |
| 339 tpara = combine_state(paste(output_base_name, ".tmp.", (1:opt$training_iterations), ".para", sep=""), mycut=0.5); | 400 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); |
| 401 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | |
| 340 para = tpara$para; | 402 para = tpara$para; |
| 341 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | |
| 342 para = apply(para, 1, function(x){paste(x, collapse=" ")}); | 403 para = apply(para, 1, function(x){paste(x, collapse=" ")}); |
| 343 para = c(readLines(paste(output_base_name, ".tmp.1.para", sep=""), n=1), para); | 404 para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para); |
| 344 writeLines(para, output_para0); | 405 writeLines(para, output_para0); |
| 345 cmd = c(cmd, "-otherpara", output_para0, output_profile0); | 406 # Now run IDEAS based on the files produced during training. |
| 346 if (length(which(cmd == "-G")) == 0) { | 407 base_cmd = get_post_training_base_cmd(base_cmd, para); |
| 347 cmd = c(cmd, "-G", length(para)-1); | 408 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); |
| 409 if (is.null(windows_by_chrom)) { | |
| 410 cmd = c(base_cmd, "-o", output_base_name); | |
| 411 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 412 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 348 } else { | 413 } else { |
| 349 tt = which(cmd == "-G"); | 414 # Performing windows by chromosome. |
| 350 cmd[tt + 1] = length(para)-1; | 415 if (length(windows_by_chrom) == 1) { |
| 351 } | 416 output_name = paste(output_base_name, i, sep="."); |
| 352 tt = which(cmd == '-C'); | 417 cmd = c(base_cmd, "-o", output_name); |
| 353 if(length(tt)>0) { | 418 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
| 354 cmd = cmd[-c(tt, tt+1)]; | 419 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); |
| 355 } | 420 } else { |
| 356 } | 421 for (i in 1:length(windows_by_chrom)) { |
| 422 line = windows_by_chrom[i]; | |
| 423 items = strsplit(line, " ")[[1]]; | |
| 424 chrom = items[[1]]; | |
| 425 window_start = items[[2]]; | |
| 426 window_end = items[[3]]; | |
| 427 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
| 428 output_name = paste(output_base_name, chrom, sep="."); | |
| 429 cmd = paste(cmd, "-o", output_name, sep=" "); | |
| 430 cmd = add_output_redirect(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 431 run_cmd(cmd, opt$save_ideas_log, opt$output_log, default_log_name); | |
| 432 } | |
| 433 } | |
| 434 } | |
| 435 # Remove temporary outputs. | |
| 436 remove_files(path=".", pattern="tmp"); | |
| 437 } |
