Mercurial > repos > greg > ideas2
comparison ideas.R @ 0:b785bcfe5cd0 draft default tip
Uploaded
author | greg |
---|---|
date | Mon, 12 Feb 2018 09:52:26 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b785bcfe5cd0 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 suppressPackageStartupMessages(library("data.table")) | |
4 suppressPackageStartupMessages(library("optparse")) | |
5 | |
6 option_list <- list( | |
7 make_option(c("--burnin_num"), action="store", dest="burnin_num", type="integer", help="Number of burnin steps"), | |
8 make_option(c("--bychr"), action="store_true", dest="bychr", default=FALSE, help="Output chromosomes in separate files"), | |
9 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", default=NULL, help="Chromosome windows positions file"), | |
10 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"), | |
11 make_option(c("--hp"), action="store_true", dest="hp", default=FALSE, help="Discourage state transition across chromosomes"), | |
12 make_option(c("--initial_states"), action="store", dest="initial_states", type="integer", default=NULL, help="Initial number of states"), | |
13 make_option(c("--ideas_input_config"), action="store", dest="ideas_input_config", help="IDEAS_input_config file"), | |
14 make_option(c("--log2"), action="store", dest="log2", type="double", default=NULL, help="log2 transformation"), | |
15 make_option(c("--maxerr"), action="store", dest="maxerr", type="double", default=NULL, help="Maximum standard deviation for the emission Gaussian distribution"), | |
16 make_option(c("--max_cell_type_clusters"), action="store", dest="max_cell_type_clusters", type="integer", default=NULL, help="Maximum number of cell type clusters allowed"), | |
17 make_option(c("--max_position_classes"), action="store", dest="max_position_classes", type="integer", default=NULL, help="Maximum number of position classes to be inferred"), | |
18 make_option(c("--max_states"), action="store", dest="max_states", type="double", default=NULL, help="Maximum number of states to be inferred"), | |
19 make_option(c("--mcmc_num"), action="store", dest="mcmc_num", type="integer", help="Number of maximization steps"), | |
20 make_option(c("--minerr"), action="store", dest="minerr", type="double", default=NULL, help="Minimum standard deviation for the emission Gaussian distribution"), | |
21 make_option(c("--output_dir"), action="store", dest="output_dir", help="Output directory, used only if job ends in error and process log needs saving"), | |
22 make_option(c("--output_log"), action="store", dest="output_log", default=NULL, help="Output log file path"), | |
23 make_option(c("--prior_concentration"), action="store", dest="prior_concentration", type="double", default=NULL, help="Prior concentration"), | |
24 make_option(c("--project_name"), action="store", dest="project_name", help="Outputs will have this base name"), | |
25 make_option(c("--rseed"), action="store", dest="rseed", type="integer", help="Seed for IDEAS model initialization"), | |
26 make_option(c("--save_ideas_log"), action="store", dest="save_ideas_log", default=NULL, help="Flag to save IDEAS process log"), | |
27 make_option(c("--standardize_datasets"), action="store_true", dest="standardize_datasets", default=FALSE, help="Standardize all datasets"), | |
28 make_option(c("--thread"), action="store", dest="thread", type="integer", help="Process threads"), | |
29 make_option(c("--training_iterations"), action="store", dest="training_iterations", type="integer", default=NULL, help="Number of training iterations"), | |
30 make_option(c("--training_windows"), action="store", dest="training_windows", type="integer", default=NULL, help="Number of training iterations") | |
31 ) | |
32 | |
33 parser <- OptionParser(usage="%prog [options] file", option_list=option_list) | |
34 args <- parse_args(parser, positional_arguments=TRUE) | |
35 opt <- args$options | |
36 | |
37 add_output_redirect <- function(cmd, output_log) { | |
38 new_cmd = c(cmd, "&>>", output_log); | |
39 return(paste(new_cmd, collapse=" ")); | |
40 } | |
41 | |
42 combine_state <- function(parafiles, method="ward.D", mycut=0.9, pcut=1.0) { | |
43 X = NULL; | |
44 K = NULL; | |
45 I = NULL; | |
46 myheader = NULL; | |
47 p = NULL; | |
48 for(i in 1:length(parafiles)) { | |
49 x = fread(parafiles[i]); | |
50 t = max(which(is.na(x[1,])==F)); | |
51 x = as.matrix(x[,1:t]); | |
52 if(i==1) { | |
53 myheader = colnames(x); | |
54 p = sqrt(9/4-2*(1-length(myheader))) - 3 / 2; | |
55 } | |
56 m = match(myheader[1:p+1], colnames(x)[1:p+1]); | |
57 v = NULL; | |
58 for(ii in 1:p) { | |
59 for(jj in 1:ii) { | |
60 a = max(m[ii],m[jj]); | |
61 b = min(m[ii],m[jj]); | |
62 v = c(v, a*(a+1)/2+b-a); | |
63 } | |
64 } | |
65 X = rbind(X, array(as.matrix(x[, c(1, 1+m, 1+p+v)]), dim=c(length(x) / (1+p+length(v)), 1 + p + length(v)))); | |
66 K = c(K, dim(x)[1]); | |
67 I = c(I, rep(i, dim(x)[1])); | |
68 } | |
69 N = length(parafiles); | |
70 p = (sqrt(1 + dim(X)[2] * 8) - 3) / 2; | |
71 omycut = mycut; | |
72 mycut = round(length(parafiles) * mycut); | |
73 M = array(X[,1:p+1] / X[,1], dim=c(dim(X)[1], p)); | |
74 V = array(0, dim=c(dim(X)[1] * p, p)); | |
75 for(i in 1:dim(X)[1]) { | |
76 t = (i - 1) * p; | |
77 l = 1; | |
78 for(j in 1:p) { | |
79 for(k in 1:j) { | |
80 V[t+j, k] = V[t+k, j] = X[i,1+p+l] / X[i,1] - M[i,j] * M[i,k]; | |
81 l = l + 1; | |
82 } | |
83 } | |
84 V[t+1:p,] = t(solve(chol(V[t+1:p,] + diag(1e-1,p)))); | |
85 } | |
86 D = array(0, dim=rep(dim(X)[1],2)); | |
87 for(i in 2:dim(X)[1]) { | |
88 for(j in 1:(i-1)) { | |
89 D[i,j] = D[j,i] = sqrt((sum((V[(i-1)*p+1:p,]%*%(M[i,]-M[j,]))^2) + sum((V[(j-1)*p+1:p,]%*%(M[i,]-M[j,]))^2))); | |
90 } | |
91 } | |
92 MM = NULL; | |
93 kk = NULL; | |
94 for(i in 1:N) { | |
95 t = 1:K[i]; | |
96 if(i > 1) { | |
97 t = t + sum(K[1:(i-1)]); | |
98 } | |
99 t = (1:dim(D)[1])[-t]; | |
100 h = hclust(as.dist(D[t,t]), method=method); | |
101 k = -1; | |
102 tM = NULL; | |
103 for(j in min(K):(min(length(t), max(K)*2))) { | |
104 m = cutree(h,k=j); | |
105 tt = NULL; | |
106 for(l in 1:j) { | |
107 tt[l] = length(unique(I[t[which(m==l)]])); | |
108 } | |
109 tk = length(which(tt>=mycut)); | |
110 if(tk > k) { | |
111 k = tk; | |
112 tM = make_parameter(1:j, I[t], m, mycut, X[t,]); | |
113 } else if(tk < k) { | |
114 break; | |
115 } | |
116 } | |
117 kk[i] = k; | |
118 MM = rbind(MM, cbind(i, tM)); | |
119 } | |
120 mysel = median(kk); | |
121 h = hclust(as.dist(D), method=method); | |
122 rt = rep(0, max(K)*2); | |
123 k = -1; | |
124 for(i in min(K):min(dim(D)[1], max(K)*2)) { | |
125 m = cutree(h,k=i); | |
126 tt = NULL; | |
127 for(j in 1:i) { | |
128 tt[j] = length(unique(I[which(m==j)])); | |
129 } | |
130 tk = length(which(tt>=mycut)); | |
131 if(tk==mysel | tk<k) { | |
132 break; | |
133 } | |
134 k = tk; | |
135 rt[i] = length(which(tt>=mycut)); | |
136 } | |
137 mysel = max(k,tk); | |
138 m = cutree(h, k=mysel); | |
139 nn = NULL; | |
140 for(i in 1:mysel) { | |
141 t = which(m==i); | |
142 nn[i] = sum(X[t,1]); | |
143 } | |
144 oo = order(nn, decreasing=T); | |
145 rt = make_parameter(oo, I, m, mycut, X); | |
146 onstate = max(rt[,1]) + 1; | |
147 ooo = NULL; | |
148 for(i in oo) { | |
149 t = which(m==i); | |
150 if(length(unique(I[t])) >= mycut) { | |
151 ooo = c(ooo, i); | |
152 } | |
153 } | |
154 d = NULL; | |
155 for(i in 1:N) { | |
156 d = rbind(d, compare_two(rt, MM[MM[,1]==i,-1])[1:onstate]); | |
157 } | |
158 dd = array(cutree(hclust(dist(c(d))), k=2), dim=dim(d)); | |
159 kk = table(c(dd)); | |
160 kk = which(as.integer(kk)==max(as.integer(kk)))[1]; | |
161 pp = apply(dd, 2, function(x){length(which(x!=kk))/length(x)}); | |
162 pp0 = apply(d, 2, function(x){length(which(x>0.5))/length(x)}); | |
163 pp[pp0<pp] = pp0[pp0<pp]; | |
164 t = which(pp > pcut); | |
165 if(length(t) > 0) { | |
166 j = 0; | |
167 nrt = NULL; | |
168 for(i in (1:onstate-1)[-t]) { | |
169 nrt = rbind(nrt, cbind(j, rt[rt[,1]==i,-1])); | |
170 j = j + 1; | |
171 } | |
172 rt = nrt; | |
173 ooo = ooo[-t]; | |
174 } | |
175 nrt = NULL; | |
176 for(i in 0:max(rt[,1])) { | |
177 t = which(rt[,1]==i); | |
178 nrt = rbind(nrt, apply(array(rt[t,], dim=c(length(t), dim(rt)[2])), 2, sum)[-1]); | |
179 } | |
180 rt = nrt; | |
181 colnames(rt) = myheader; | |
182 O = NULL; | |
183 Ip = NULL; | |
184 Xp = NULL; | |
185 k = 0; | |
186 for(i in 1:length(parafiles)) { | |
187 str = gsub(".para", ".profile", parafiles[i]); | |
188 p = as.matrix(read.table(str)); | |
189 u = array(0, dim=c(dim(p)[1], length(ooo))); | |
190 for(j in 1:length(ooo)) { | |
191 t = which(m[k+1:K[i]] == ooo[j]); | |
192 u[,j] = apply(array(p[,1+t], dim=c(dim(p)[1], length(t))), 1, sum); | |
193 } | |
194 k = k + K[i]; | |
195 u = u / (apply(u, 1, sum) + 1e-10); | |
196 Xp = rbind(Xp, cbind(p[,1], u)); | |
197 Ip = c(Ip, rep(i,dim(u)[1])); | |
198 } | |
199 hp = hclust(dist(((Xp[,-1]+min(1e-3, min(Xp[,-1][Xp[,-1]>0]))))), method=method); | |
200 ocut = min(mycut/2, length(parafiles)/2); | |
201 t = range(as.integer(table(Ip))); | |
202 Kp = NULL; | |
203 for(i in t[1]:(t[2]*2)) { | |
204 m = cutree(hp, k=i); | |
205 tt = table(Ip,m); | |
206 ll = apply(tt, 2, function(x){length(which(x>0))}); | |
207 Kp = c(Kp, length(which(ll>=ocut))); | |
208 } | |
209 oN = (t[1]:(t[2]*2))[which(Kp==max(Kp))[1]]; | |
210 m = cutree(hp, k=oN); | |
211 tt = table(Ip,m); | |
212 ll = apply(tt, 2, function(x){length(which(x>0))}); | |
213 tt = which(ll>=ocut); | |
214 for(i in tt) { | |
215 t = which(m==i); | |
216 O = rbind(O, c(sum(Xp[t, 1]), apply(array(Xp[t,-1]*Xp[t,1], dim=c(length(t), dim(Xp)[2]-1)), 2, sum)/sum(Xp[t, 1]))); | |
217 } | |
218 nrt = NULL; | |
219 nrt$para = rt; | |
220 nrt$profile = O; | |
221 return(nrt); | |
222 } | |
223 | |
224 compare_two <- function(n, m) { | |
225 NN = get_mean(n); | |
226 MM = get_mean(m); | |
227 p = (-3 + sqrt(9 + 8 * (dim(n)[2] - 2))) / 2; | |
228 dd = NULL; | |
229 for (i in 1:dim(NN)[1]) { | |
230 dd[i] = min(apply(array(MM[,1:p], dim=c(dim(MM)[1],p)), 1, function(x){sqrt(sum((x-NN[i,1:p])^2))})); | |
231 } | |
232 for (i in 1:dim(MM)[1]) { | |
233 dd[i+dim(NN)[1]] = min(apply(array(NN[,1:p], dim=c(dim(NN)[1],p)), 1, function(x){sqrt(sum((x-MM[i,1:p])^2))})); | |
234 } | |
235 return(dd); | |
236 } | |
237 | |
238 get_base_cmd <- function(ideas_input_config, chrom_bed_input, training_iterations, bychr, hp, standardize_datasets, log2, | |
239 max_states, initial_states, max_position_classes, max_cell_type_clusters, prior_concentration, | |
240 burnin_num, mcmc_num, minerr, maxerr, rseed, thread) { | |
241 base_cmd = paste("ideas", ideas_input_config, sep=" "); | |
242 if (!is.null(chrom_bed_input)) { | |
243 base_cmd = paste(base_cmd, chrom_bed_input, sep=" "); | |
244 } | |
245 if (!is.null(training_iterations)) { | |
246 base_cmd = paste(base_cmd, "-impute none", sep=" "); | |
247 } | |
248 if (bychr) { | |
249 base_cmd = paste(base_cmd, "-bychr", sep=" "); | |
250 } | |
251 if (hp) { | |
252 base_cmd = paste(base_cmd, "-hp", sep=" "); | |
253 } | |
254 if (standardize_datasets) { | |
255 base_cmd = paste(base_cmd, "-norm", sep=" "); | |
256 } | |
257 if (!is.null(log2)) { | |
258 base_cmd = paste(base_cmd, "-log2", log2, sep=" "); | |
259 } | |
260 if (!is.null(max_states)) { | |
261 base_cmd = paste(base_cmd, "-G", max_states, sep=" "); | |
262 } | |
263 if (!is.null(initial_states)) { | |
264 base_cmd = paste(base_cmd, "-C", initial_states, sep=" "); | |
265 } | |
266 if (!is.null(max_position_classes)) { | |
267 base_cmd = paste(base_cmd, "-P", max_position_classes, sep=" "); | |
268 } | |
269 if (!is.null(max_cell_type_clusters)) { | |
270 base_cmd = paste(base_cmd, "-K", max_cell_type_clusters, sep=" "); | |
271 } | |
272 if (!is.null(prior_concentration)) { | |
273 base_cmd = paste(base_cmd, "-A", prior_concentration, sep=" "); | |
274 } | |
275 base_cmd = paste(base_cmd, "-sample", burnin_num, mcmc_num, sep=" "); | |
276 if (!is.null(minerr)) { | |
277 base_cmd = paste(base_cmd, "-minerr", minerr, sep=" "); | |
278 } | |
279 if (!is.null(maxerr)) { | |
280 base_cmd = paste(base_cmd, "-maxerr", maxerr, sep=" "); | |
281 } | |
282 base_cmd = paste(base_cmd, "-rseed", rseed, sep=" "); | |
283 base_cmd = paste(base_cmd, "-thread", thread, sep=" "); | |
284 return(base_cmd); | |
285 } | |
286 | |
287 get_mean <- function(n) { | |
288 N = NULL; | |
289 for(i in sort(unique(n[,1]))) { | |
290 t = which(n[,1]==i); | |
291 N = rbind(N, apply(array(n[t,], dim=c(length(t), dim(n)[2])), 2, sum)[-1]); | |
292 } | |
293 NN = N[,-1] / N[,1]; | |
294 return(array(NN, dim=c(length(NN)/(dim(n)[2]-2), dim(n)[2]-2))); | |
295 } | |
296 | |
297 get_post_training_base_cmd <- function(base_cmd, para) { | |
298 # Change base_cmd due to training mode. | |
299 base_cmd_items = as.list(strsplit(base_cmd[1], split=" ", fixed=TRUE))[[1]]; | |
300 if (length(which(base_cmd_items == "-G")) == 0) { | |
301 base_cmd_items = c(base_cmd_items, "-G", length(para)-1); | |
302 } else { | |
303 tt = which(base_cmd_items == "-G"); | |
304 base_cmd_items[tt + 1] = length(para)-1; | |
305 } | |
306 tt = which(base_cmd_items == '-C'); | |
307 if(length(tt) > 0) { | |
308 base_cmd_items = base_cmd_items[-c(tt, tt+1)]; | |
309 } | |
310 base_cmd = paste(base_cmd_items, collapse=" "); | |
311 return(base_cmd); | |
312 } | |
313 | |
314 get_windows_by_chrom <- function(chromosome_windows) { | |
315 fh = file(chromosome_windows, "r"); | |
316 windows_by_chrom = readLines(fh); | |
317 close(fh); | |
318 return(windows_by_chrom); | |
319 } | |
320 | |
321 make_parameter <- function(myorder, id, mem, mycut, para) { | |
322 rt = NULL; | |
323 j = 0; | |
324 for(i in myorder) { | |
325 t = which(mem==i); | |
326 if (length(unique(id[t])) >= mycut) { | |
327 rt = rbind(rt, cbind(j, array(para[t,], dim=c(length(t), dim(para)[2])))); | |
328 j = j + 1; | |
329 } | |
330 } | |
331 return(rt); | |
332 } | |
333 | |
334 remove_files <- function(path, pattern) { | |
335 files = list.files(path=path, pattern=pattern); | |
336 for (f in files) { | |
337 unlink(f); | |
338 } | |
339 } | |
340 | |
341 run_cmd <- function(cmd, save_ideas_log, output_log, output_dir) { | |
342 rc = system(cmd); | |
343 if (rc != 0) { | |
344 if (is.null(save_ideas_log)) { | |
345 to_path = paste(output_dir, output_log, sep="/"); | |
346 file.rename(output_log, to_path); | |
347 } | |
348 quit(save="no", status=rc); | |
349 } | |
350 } | |
351 | |
352 # Initialize values. | |
353 if (is.null(opt$save_ideas_log)) { | |
354 output_log = "ideas_log.txt"; | |
355 } else { | |
356 output_log = opt$output_log; | |
357 } | |
358 if (is.null(opt$chromosome_windows)) { | |
359 windows_by_chrom = NULL; | |
360 } else { | |
361 # Read chromosome_windows.txt into memory. | |
362 windows_by_chrom = get_windows_by_chrom(opt$chromosome_windows); | |
363 } | |
364 base_cmd = get_base_cmd(opt$ideas_input_config, opt$chrom_bed_input, opt$training_iterations, opt$bychr, opt$hp, | |
365 opt$standardize_datasets, opt$log2, opt$max_states, opt$initial_states, opt$max_position_classes, | |
366 opt$max_cell_type_clusters, opt$prior_concentration, opt$burnin_num, opt$mcmc_num, opt$minerr, | |
367 opt$maxerr, opt$rseed, opt$thread); | |
368 output_base_name = opt$project_name; | |
369 # Perform analysis. | |
370 if (is.null(opt$training_iterations)) { | |
371 # Not performing training. | |
372 if (is.null(windows_by_chrom)) { | |
373 # Not performing windows by chromosome. | |
374 output_name = output_base_name; | |
375 cmd = paste(base_cmd, "-o", output_name, sep=" "); | |
376 cmd = add_output_redirect(cmd, output_log); | |
377 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
378 } else { | |
379 # Performing windows by chromosome. | |
380 for (i in 1:length(windows_by_chrom)) { | |
381 line = windows_by_chrom[i]; | |
382 items = strsplit(line, " ")[[1]]; | |
383 chrom = items[1]; | |
384 window_start = items[2]; | |
385 window_end = items[3]; | |
386 output_name = paste(output_base_name, chrom, sep="."); | |
387 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
388 cmd = paste(cmd, "-o", output_name, sep=" "); | |
389 cmd = add_output_redirect(cmd, output_log); | |
390 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
391 } | |
392 } | |
393 } else { | |
394 # Performing training. | |
395 output_para0 = paste(output_base_name, "para0", sep="."); | |
396 output_profile0 = paste(output_base_name, "profile0", sep="."); | |
397 for (i in 1:opt$training_iterations) { | |
398 cmd = paste(base_cmd, "-o", paste(output_base_name, ".tmp.", i, sep=""), sep=" "); | |
399 cmd = add_output_redirect(cmd, output_log); | |
400 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
401 } | |
402 tpara = combine_state(paste(output_base_name, "tmp", (1:opt$training_iterations), "para", sep="."), mycut=0.5); | |
403 write.table(tpara$profile, output_profile0, quote=F, row.names=F, col.names=F); | |
404 para = tpara$para; | |
405 para = apply(para, 1, function(x){paste(x, collapse=" ")}); | |
406 para = c(readLines(paste(output_base_name, "tmp", "1", "para", sep="."), n=1), para); | |
407 writeLines(para, output_para0); | |
408 # Now run IDEAS based on the files produced during training. | |
409 base_cmd = get_post_training_base_cmd(base_cmd, para); | |
410 base_cmd = paste(base_cmd, "-otherpara", output_para0[[1]], output_profile0[[1]], sep=" "); | |
411 if (is.null(windows_by_chrom)) { | |
412 cmd = c(base_cmd, "-o", output_base_name); | |
413 cmd = add_output_redirect(cmd, output_log); | |
414 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
415 } else { | |
416 # Performing windows by chromosome. | |
417 if (length(windows_by_chrom) == 1) { | |
418 output_name = paste(output_base_name, i, sep="."); | |
419 cmd = c(base_cmd, "-o", output_name); | |
420 cmd = add_output_redirect(cmd, output_log); | |
421 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
422 } else { | |
423 for (i in 1:length(windows_by_chrom)) { | |
424 line = windows_by_chrom[i]; | |
425 items = strsplit(line, " ")[[1]]; | |
426 chrom = items[[1]]; | |
427 window_start = items[[2]]; | |
428 window_end = items[[3]]; | |
429 cmd = paste(base_cmd, "-inv", window_start, window_end, sep=" "); | |
430 output_name = paste(output_base_name, chrom, sep="."); | |
431 cmd = paste(cmd, "-o", output_name, sep=" "); | |
432 cmd = add_output_redirect(cmd, output_log); | |
433 run_cmd(cmd, opt$save_ideas_log, output_log, opt$output_dir); | |
434 } | |
435 } | |
436 } | |
437 # Remove temporary outputs. | |
438 remove_files(path=".", pattern="tmp"); | |
439 } |