#!/usr/bin/env Rscript

suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(hwriter))
suppressPackageStartupMessages(library(R2HTML))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))

max_ORF_length = function(s) {
    ## check all six reading frames; return the length of the longest stretch
    ## (in amino acids) without a stop codon
    L = 0
    for (i in 1:3) {
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
            fixed = TRUE))))
        L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
            i))), "*", fixed = TRUE))))
    }
    return(L)
}

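## Usage sketch (not executed) - a minimal check on a hypothetical toy
## sequence; translate() may warn when the frame length is not a multiple
## of three.
if (FALSE) {
    s = DNAStringSet("ATGAAAGGGTTTTAAATGCCC")
    max_ORF_length(s)
}
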
kmers2graph = function(kmers, mode = "strong", prop = NULL) {
    kmerLength = nchar(kmers[1, 1])
    if (ncol(kmers) == 2) {
        kmers$size = kmers[, 2]/sum(kmers[, 2])
    }
    colnames(kmers) = c("name", "count", "size")
    if (!is.null(prop)) { # not used (prop is NULL), and probably wrong - filtering is done before sorting!!
        p = cumsum(kmers$size)
        kmers = kmers[p < prop, ]
    }
    N = dim(kmers)[1]
    kmers = kmers[order(kmers$size), ]
    ## connect k-mers that overlap by k-1 bases (suffix of one matches the
    ## prefix of the other)
    kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),
        stringsAsFactors = FALSE)
    kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)

    ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
    df = inner_join(kme, kms, by = "kmer")[, c(2, 3)]

    ## legacy blast-based overlap detection, kept for reference:
    ## names(kms) = seq_along(kms)
    ## kme = substring(kmers$name, 2)
    ## names(kme) = seq_along(kme)
    ## database = tempfile()
    ## query = tempfile()
    ## output = tempfile()
    ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
    ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
    ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
    ## system(cmd, ignore.stdout = TRUE)
    ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
    ##             query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
    ## system(cmd)
    ## df = try({
    ##     read.table(output, as.is = TRUE)
    ## })
    ## if (class(df) == "try-error"){
    ##     print("creation of kmer graph failed")
    ##     print(query)
    ##     print(output)
    ##     print(database)
    ##     return(NULL)
    ## }
    ## unlink(query)
    ## unlink(paste(database, "*", sep = ""))
    ## unlist(output)
    gm_mean = function(x, na.rm = TRUE) {
        ## geometric mean over positive values
        exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
    }

    ## edge weight = geometric mean of the counts of the two incident k-mers
    whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
    G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
        2]], weight = whg), vertices = kmers[, 1:3])
    # separate to connected components:
    ccs = clusters(G, mode = mode)$membership
    sel_cls = which(tabulate(ccs) > 1)
    Gs = list()
    for (i in seq_along(sel_cls)) {
        Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
    }
    ## reorder components, largest first
    Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
    return(Gs)
}


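## Usage sketch (not executed) - kmers2graph takes a table of k-mer strings
## and counts; the toy 5-mers below are hypothetical and form one cycle.
if (FALSE) {
    km = data.frame(V1 = c("ACGTA", "CGTAC", "GTACG", "TACGT"),
                    V2 = c(10, 8, 7, 5), stringsAsFactors = FALSE)
    Gs = kmers2graph(km)
    length(Gs)       # number of connected components with more than one vertex
    vcount(Gs[[1]])  # size of the largest component
}
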
OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
    ## is the ogdf layout binary available?
    if (is.null(OGDF)) {
        OGDF = Sys.getenv("OGDF")
        if ("" == OGDF) {
            options(warn = -1)
            OGDF = system("which runOGDFlayout", intern = TRUE)
            options(warn = 0)
            if (length(OGDF) == 0) {
                cat("path to runOGDFlayout not found\n")
                return(NULL)
            }
        }
    }
    if (is.null(ncol)) {
        if (is.null(E(G)$weight)) {
            el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
        } else {
            el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
        }
        ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
        write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
            quote = FALSE)
    } else {
        # copy ncol:
        ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
            sep = "")
        file.copy(ncol, ncol_tmp)
        ncol = ncol_tmp
    }
    algopt = c("fmmm", "sm", "fme")
    if (!all(alg %in% algopt)) {
        stop("alg must be one of: ", paste(algopt, collapse = ", "), "\n")
    }

    # output file:
    Louts = list()
    layout_file = tempfile(pattern = as.character(Sys.getpid()))
    for (i in alg) {
        cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
            ncol)
        system(cmd, intern = TRUE)
        L = read.table(layout_file, skip = ecount(G))
        L = L[match(V(G)$name, L$V2), ]
        Lout = as.matrix(L[, -(1:2)])
        unlink(layout_file)
        Louts[[i]] = Lout
    }
    # clean up
    unlink(ncol)
    return(Louts)
}

xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")

kmers2color = function(s, position = NULL) {
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
    color_base = substring(s, position, position)
    colors = color_code[color_base]
    names(colors) = color_base
    return(colors)
}

get_sequence = function(g, v, position = NULL) {
    s = V(g)$name
    if (is.null(position)) {
        position = round(nchar(s[1])/2, 0)
        ## position = 1
        position = nchar(s[1])
    }
    nt = paste(substring(s[v], position, position), collapse = "")
    return(nt)
}


get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
    ## grow the k-mer graph from the most frequent k-mers until the largest
    ## connected component exceeds thr vertices and min_coverage of the input
    if (is.null(start)) {
        i = sum(cumsum(km$freq) < 0.5)
    } else {
        i = sum(cumsum(km$freq) < start)
    }
    continue = TRUE
    while (continue) {
        if (i > nrow(km)) {
            i = nrow(km)
            continue = FALSE
            step = 1
        }
        GG = kmers2graph(km[1:i, ])
        if (length(GG) > 0) {
            if (vcount(GG[[1]]) > thr) {
                if (sum(V(GG[[1]])$size) >= min_coverage) {
                    GG[[1]]$input_coverage = sum(km$freq[1:i])
                    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
                    return(GG[[1]])
                }
            }
        }
        i = round(i * step)
    }
    if (length(GG) == 0 | is.null(GG)) {
        return(NULL)
    }

    GG[[1]]$input_coverage = sum(km$freq[1:i])
    GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
    return(GG[[1]])
}


paths2string = function(paths) {
    ## the first character of each vertex name along a path spells its sequence
    pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
    return(pathstring)
}

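## Usage sketch (not executed) - paths are igraph vertex sequences; as_ids()
## yields the vertex names, whose first letters are concatenated.
if (FALSE) {
    g = graph.data.frame(data.frame(from = c("ACG", "CGT"), to = c("CGT", "GTA")))
    paths2string(list(V(g)[1:3]))  # "ACG"
}
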
align_paths = function(paths, G) {
    ## circularly shift the paths so that they all start at a shared node;
    ## paths lacking the centre node are aligned by best positional overlap
    shift = rep(NA, length(paths))
    thr = 0 # minimal length
    tr_paths = list()
    Seqs = list()
    centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]

    for (i in seq_along(paths)) {
        if (centre_node %in% paths[[i]]) {
            S = which(paths[[i]] %in% centre_node)
            shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        } else {
            shift[i] = NA
        }
    }
    paths_n = lapply(paths, as.numeric)
    ## all paths within one group have equal length, so cbind is safe here
    tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
    new_shift = shift
    for (i in which(is.na(shift))) {
        score = numeric(length(paths_n[[i]]))
        for (S in seq_along(paths_n[[i]])) {
            if (S == 1) {
                path_tmp_n = paths_n[[i]]
            } else {
                path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
                    1)])
            }
            score[S] = sum(tr_paths_n == path_tmp_n)
        }
        if (sum(score) != 0) {
            S = which.max(score)
            new_shift[i] = S
            if (S == 1) {
                tr_paths[[i]] = paths[[i]]
            } else {
                tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
                    1)])
            }
            Seqs[[i]] = get_sequence(G, tr_paths[[i]])
        }
    }
    shift = new_shift
    # TODO: try to shift based on the sequences themselves
    return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
}

make_consensus = function(paths_info, G) {
    include = !is.na(paths_info$shift)
    ## make alignments using mafft
    aln = mafft(unlist(paths_info$Seqs[include]))
    CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
        G = G)
    gaps = get_gaps_from_alignment(aln)
    CMnorm = CM/rowSums(CM)
    bases = colnames(CM)
    consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
        0.2], decreasing = TRUE)]), paste, collapse = "")
    consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
        fixed = TRUE)
    number_of_SNP = sum(rowSums(CM > 0) > 1)
    SNP_positions = which(rowSums(CM > 0) > 1)
    number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
        "")) == "-") > 0)
    indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
        "-") > 0)
    if (length(SNP_positions) > 0) {
        variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
            1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
            ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
    } else {
        variable_sites = NULL
    }
    variable_positions = unique(c(SNP_positions, indel_positions))
    return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
        number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
        indel_positions = indel_positions, variable_positions = variable_positions,
        variable_sites = variable_sites, gaps = gaps))
}


estimate_monomer = function(G, weights = NULL, limit = NULL) {
    if (is.null(G)) {
        return(NULL)
    }
    ## estimate monomer from kmer based graph
    V(G)$id = 1:vcount(G)
    GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
    cls = clusters(GS)$membership

    ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
        cls)) ## from each branch use only the vertex with the largest size!

    ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
    ids_size = V(G)$size[ids]
    N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
    if (length(ids) > 10000) {
        ids = ids[1:N50]
    }
    ## use only large vertices in search! how many?
    el = get.edgelist(G, names = FALSE)
    node_use = numeric(vcount(G))
    LL = numeric()
    ## W=numeric()
    i = 0
    paths = list()
    if (is.null(weights)) {
        ## the second assignment overrides the first (kept for reference)
        weights = (max(E(G)$weight) - (E(G)$weight) + 1)
        weights = E(G)$weight^(-3)
    }
    included = rep(FALSE, vcount(G))
    W_total = sum(V(G)$size)

    coverage = numeric()
    t0 = c()
    i = 0
    j = 0
    for (n in ids) {
        j = j + 1
        t0[j] = Sys.time()
        if (included[n]) {
            next
        }
        m = which(el[, 1] %in% n)
        i = i + 1
        s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
        included[as.numeric(s$vpath[[1]])] = TRUE
        paths[[i]] = s$vpath[[1]]
        LL[i] = (length(s$vpath[[1]]))
    }

    ## evaluate if paths should be divided to variants - by length and path weight
    paths_clusters = split(paths, LL)
    paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))

    ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
    paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))

    ## evaluate weight for individual paths:
    for (v in seq_along(paths_consensus)) {
        p = paths_clusters_tr[[v]]$tr_paths
        ## clean
        p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
        L = sapply(p, length)
        p_groups = split(p, L)
        w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
            as.numeric)))]))
        total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
        ## use names(w_groups) so lengths stay paired with their weights
        LW = data.frame(`Length estimate` = as.numeric(names(w_groups)), weight = w_groups,
            stringsAsFactors = FALSE)
        LW = LW[order(LW$weight, decreasing = TRUE), ]
        rownames(LW) = NULL
        paths_consensus[[v]]$total_score = total_score
        paths_consensus[[v]]$length_variant_score = LW
    }
    return(list(estimates = paths_consensus, paths = paths_clusters_tr))
}


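## Usage sketch (not executed) - estimate_monomer runs on one component from
## kmers2graph; km is a hypothetical k-mer count table, and mafft plus the
## OGDF layout binary must be available.
if (FALSE) {
    options(CPU = 2)
    G = kmers2graph(km)[[1]]
    m = estimate_monomer(G)
    m$estimates[[1]]$consensus2  # consensus sequence of the first variant
}
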
detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
    ## reads_file contains oriented reads!
    min_offset = 10
    max_end_dist = 2
    min_aln_length = 30
    max_pbs_search = 30
    thr_length = 12
    end_position = 23
    insertion_proportion_threshold <- 0.15
    ## for testing:
    if (FALSE) {
        library(Biostrings)
        thr_length = 12
        end_position = 23
        dimers_file = "consensus_dimer.fasta"
        reads_file = "reads_oriented.fas"
        tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
    }
    ## find reads which are only partially aligned to the reference dimer
    dimers_seq = readDNAStringSet(dimers_file)
    cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
    system(cmd, ignore.stdout = TRUE)
    ## temporary blast output (kept separate so the output argument is not clobbered)
    blast_out = paste0(reads_file, "blast_out.csv")
    columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
        "slen", "pident", "length")
    cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
        reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
        "-out", blast_out)
    system(cmd)
    blastdf = read.table(blast_out, as.is = TRUE, col.names = columns, comment.char = "")
    unlink(blast_out)
    blastdf = blastdf[blastdf$length >= min_aln_length, ]

    ## expand to the whole read to see the unaligned read parts
    blastdf_expand = blastdf
    blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
    blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
    blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
    blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
    pS = blastdf$sstart
    pE = blastdf$send
    pSF = blastdf_expand$sstart
    pEF = blastdf_expand$send
    cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
        blastdf$sstart >= max_pbs_search
    cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
        blastdf$slen - max_pbs_search

    ## coverage of alignments: evaluate coverage at sites with breaks; a minimal
    ## proportion (insertion_proportion_threshold) of interrupted alignments at a
    ## given position is required for it to enter the additional search
    coverage_profiles = subject_coverage(blastdf)

    ## extract flanking sequences - ca. 50 nt; they should contain the tRNA sequence
    fin = tempfile()
    fout = tempfile()
    scoreL = scoreR = 0

    ## left side unique insertion sites
    if (any(cond1)) {
        insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
        }
        insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_left) > 0) {
            s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
                "sstart"] - max_pbs_search)
            Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
                s, insertion_OK_left$sstart)

            ## names are CONSENSUSID__READID_position
            names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
                insertion_OK_left$sstart)
            ## check presence of TG dinucleotide
            TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
                names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
                2))
            if (any(TG > 0)) {
                writeXStringSet(Lpart[TG > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                scoreL = get_score(df_pass_L)
                write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
                    sep = "\t")
            }
        }
    }
    if (any(cond2)) {
        ## search right side
        insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
        ## check coverage
        insertion_sites$coverage = 0
        for (i in 1:nrow(insertion_sites)) {
            insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
        }
        insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
            insertion_proportion_threshold), ]

        if (nrow(insertion_OK_right) > 0) {
            s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
                insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
            Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
                insertion_OK_right$send, s)
            names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
                insertion_OK_right$send)

            ## check presence of CA dinucleotide
            CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
                names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
                2))
            if (any(CA > 0)) {
                writeXStringSet(Rpart[CA > 0], filepath = fin)
                cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
                    "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
                    "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
                    "-out", fout)

                system(cmd, ignore.stdout = TRUE)
                df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
                filter1 = df$length >= thr_length
                filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
                df_pass = df[filter1 & filter2, , drop = FALSE]
                df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
                write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
                    sep = "\t")
                scoreR = get_score(df_pass_R)
            }
        }
    }
    unlink(fin)
    unlink(fout)
    return(max(scoreL, scoreR))
}


subject_coverage = function(blastdf) {
    ## calculate per-position coverage for all blast subjects
    coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
        as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
    })
    return(coverage_profiles)
}

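## Usage sketch (not executed) - two hypothetical overlapping alignments on
## one subject; IRanges is loaded together with Biostrings.
if (FALSE) {
    b = data.frame(sseqid = c("s1", "s1"), sstart = c(1, 5), send = c(10, 12))
    subject_coverage(b)[["s1"]]  # 1 over 1-4, 2 over 5-10, 1 over 11-12
}
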
get_score = function(x) {
    ## keep best tRNA
    if (nrow(x) == 0) {
        return(0)
    }
    xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
    xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
    xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
    best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
    return(best_score)
}


dotter = function(seq1, seq2 = NULL, params = "") {
    if (is.null(seq2)) {
        seq2 = seq1
    }
    library(Biostrings)
    if (class(seq1) != "DNAStringSet") {
        seq1 = BStringSet(seq1)
    }
    if (class(seq2) != "DNAStringSet") {
        seq2 = BStringSet(seq2)
    }
    sf1 = tempfile("seq1")
    writeXStringSet(seq1, file = sf1)
    sf2 = tempfile("seq2")
    writeXStringSet(seq2, file = sf2)
    system(paste("dotter", params, sf1, sf2), wait = FALSE)
    Sys.sleep(2)
    unlink(c(sf1, sf2))
    return(NULL)
}


dotter2 = function(seq1, seq2 = NULL, params = NULL) {
    if (is.null(seq2)) {
        seq2 = seq1
    }
    if (is.null(params)) {
        params = " -windowsize 30 -threshold 45 "
    }
    library(Biostrings)
    if (class(seq1) != "DNAStringSet") {
        seq1 = DNAStringSet(seq1)
    }
    if (class(seq2) != "DNAStringSet") {
        seq2 = DNAStringSet(seq2)
    }
    L1 = nchar(seq1)
    L2 = nchar(seq2)

    tmpdat1 = tempfile()
    dir.create(tmpdat1)
    tmpdat2 = tempfile()
    dir.create(tmpdat2)
    oridir = getwd()

    seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
    seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
    seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))

    sf1 = tempfile("seq1")
    writeXStringSet(seq1_merged, filepath = sf1)
    sf2 = tempfile("seq2")
    sf2rc = tempfile("seq2rc")
    writeXStringSet(seq2_merged, filepath = sf2)
    writeXStringSet(seq2rc_merged, filepath = sf2rc)

    cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
    cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc,
        params)
    setwd(tmpdat1)
    output1 = system(cmd1, intern = TRUE)
    setwd(tmpdat2)
    output2 = system(cmd2, intern = TRUE)
    setwd(oridir)

    fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
    rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
    rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]

    if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
        coord1 = NULL
    } else {
        coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord1 = matrix(coord1, ncol = 5)
    }

    fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
    rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
    rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]

    if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
        coord2 = NULL
    } else {
        coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[",
            c(3, 5, 7, 9, 11)), 1, as.numeric)
        coord2 = matrix(coord2, ncol = 5)
    }
    unlink(sf1)
    unlink(sf2)
    unlink(sf2rc)
    unlink(tmpdat1, recursive = TRUE)
    unlink(tmpdat2, recursive = TRUE)

    N1 = sum(nchar(seq1))
    N2 = sum(nchar(seq2))
    op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
    on.exit(par(op))
    plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
    if (!is.null(coord1)) {
        segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[,
            4])
    }
    if (!is.null(coord2)) {
        segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 -
            coord2[, 4])
    }
    abline(v = c(0, cumsum(L1)), col = "green")
    abline(h = c(0, cumsum(L2)), col = "green")
    box()
    axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0,
        cex.axis = 0.7)
    axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0,
        cex.axis = 0.7)
    invisible(list(coord1, coord2))
}


CM2sequence = function(x) {
    bases = c(colnames(x), "N")
    x = cbind(x, 0)
    colnames(x) = bases
    seqs = paste(bases[apply(x, 1, which.max)], collapse = "")
    return(seqs)
}

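## Usage sketch (not executed) - pick the highest-weight base in each row of
## a consensus matrix; the toy matrix is hypothetical.
if (FALSE) {
    CM = matrix(c(9, 0, 1, 0,
                  0, 8, 2, 0), ncol = 4, byrow = TRUE,
                dimnames = list(NULL, c("A", "C", "G", "T")))
    CM2sequence(CM)  # "AC"
}
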
## within the alignment, calculate significance from k-mer weights
calculate_consensus_matrix = function(aln, tr_paths, G) {
    bases = c("A", "C", "G", "T", "-")
    positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
        "-"))
    base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
    weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
    kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
    for (i in seq_along(positions)) {
        weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
    }
    names(kmer_rel_proportions) = V(G)$name
    ## get weights for gaps by approximation
    fitgaps = function(y) {
        if (sum(y == 0) == 0) {
            return(y)
        } else {
            ## replicate the profile three times so that interpolation also
            ## covers gaps at the ends, then keep the middle copy
            y0 = rep(y[y != 0], 3)
            x0 = which(rep(y, 3) != 0)
            fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
        }
        return(fitted$y[-seq_along(y)][seq_along(y)])
    }
    weights_with_gaps = t(apply(weights, 1, fitgaps))
    ## weights_with_gaps = get_gap_weights(weights, aln, G) ## this step takes way
    ## too long and is not so important; TODO: handle gaps more effectively

    ## get consensus matrix
    CM = sapply(1:nchar(aln[[1]]), function(i) {
        sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
            sum)
    })
    return(t(CM)[, 1:5])
}


get_gaps_from_alignment = function(aln) {
    gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
    return(gaps_positions)
}

plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
    highlight = NULL) {
    if (!is.null(G$L) & is.null(L)) {
        L = G$L
    }
    if (is.null(L)) {
        ## L = layout.kamada.kawai(G)
        L = OGDFlayout(G)[[1]]
    }
    clr = kmers2color(V(G)$name)
    if (!is.null(highlight)) {
        clr[highlight] = "#00000080"
    }
    if (!is.null(ord)) {
        clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
    }
    if (is.null(vertex.size)) {
        vertex.size = rescale(V(G)$size, to = c(0.5, 6))
    }
    plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.2)
}

rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
    if (is.null(L)) {
        L = layout.kamada.kawai(G)
    }
    rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
        vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
        edge.arrow.size = 0.5)
}


mafft = function(seqs, params = "--auto --thread 1 ") {
    if (length(seqs) < 2) {
        return(seqs)
    }
    infile = tempfile()
    if (class(seqs) == "character") {
        seqs = DNAStringSet(seqs)
    }
    writeXStringSet(seqs, file = infile)
    outfile = tempfile()
    cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
    system(cmd, intern = TRUE, ignore.stderr = FALSE)
    aln = readDNAStringSet(outfile)
    unlink(c(outfile, infile))
    return(aln)
}

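## Usage sketch (not executed) - requires the mafft binary on PATH; returns
## the alignment as a gapped DNAStringSet. The input sequences are hypothetical.
if (FALSE) {
    aln = mafft(c("ACGTACGT", "ACGTTACGT", "ACGACGT"))
    as.character(aln)
}
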
mgblast = function(databaseSeq, querySeq) {
    params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
    # no dust filtering:
    paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"

    database = tempfile()
    query = tempfile()
    output = tempfile()
    if (class(databaseSeq) == "character") {
        database = databaseSeq
        do_not_delete_database = TRUE
    } else {
        writeXStringSet(databaseSeq, filepath = database, format = "fasta")
        do_not_delete_database = FALSE
    }
    if (class(querySeq) == "character") {
        query = querySeq
        do_not_delete_query = TRUE
    } else {
        writeXStringSet(querySeq, filepath = query, format = "fasta")
        do_not_delete_query = FALSE
    }
    ## create database:
    cmd = paste("formatdb -i", database, "-p F")
    system(cmd)
    cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
    system(cmd)
    if (file.info(output)$size == 0) {
        # no hits - retry without dust masking
        cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
        system(cmd)
    }
    blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
    unlink(output)
    if (!do_not_delete_query) {
        unlink(query)
    }
    if (!do_not_delete_database) {
        unlink(paste(database, "*", sep = ""))
    }
    colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
        "s.start", "s.end", "pid", "weight", "e.value", "strand")
    blastOut
}


estimate_sample_size = function(NV, NE, maxv, maxe) {
    ## edge density of the full graph
    d = (2 * NE)/(NV * (NV - 1))
    ## expected number of edges when sampling maxv vertices
    eEst = (maxv * (maxv - 1) * d)/2
    ## number of vertices that would give maxe edges at density d
    nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
    if (eEst >= maxe) {
        N = round(nEst)
        E = round((N * (N - 1) * d)/2)
    }
    if (nEst >= maxv) {
        N = maxv
        E = round((N * (N - 1) * d)/2)
    }
    return(N)
}

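## nEst above solves n*(n-1)*d/2 = maxe for n while keeping the observed
## density d fixed, i.e. the positive root n = (d + sqrt(d^2 + 8*d*maxe))/(2*d).
## Worked example (hypothetical numbers, not from the pipeline):
if (FALSE) {
    ## d ~ 0.12, nEst ~ 91000 >= maxv, so the vertex cap applies:
    estimate_sample_size(NV = 1e+05, NE = 6e+08, maxv = 40000, maxe = 5e+08)  # 40000
}

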
mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
    oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
    include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
    maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
    cat("loading blast results\n")
    if (repex) {
        cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
            "s.end", "sign")
        colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
            "numeric", "numeric", "numeric", "NULL", "NULL", "character")
    } else {
        cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
            "s.end", "weight", "sign")
        colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
            "numeric", "numeric", "NULL", "numeric", "NULL", "character")
    }
    if (class(blastfile) == "data.frame") {
        blastTable = blastfile
        colnames(blastTable)[12] = "sign"
    } else {
        blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
            colClasses = colcls, comment.char = "")
        colnames(blastTable) = cln
    }
    ## check for duplicates!
    key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
        query)))
    if (any(duplicated(key))) {
        blastTable = blastTable[!duplicated(key), ]
    }
    seqs = readDNAStringSet(seqfile)
    ## calculate pair completeness for reads before sampling
    if (is.null(pair_completeness)) {
        if (paired) {
            if (seqfile_full != seqfile) {
                seqs_full = readDNAStringSet(seqfile_full)
                pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
                rm(seqs_full)
            } else {
                pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
            }
            pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
        } else {
            pair_completeness = 0
        }
    }
    NV = length(seqs)        # vertices
    NE = nrow(blastTable)    # edges
    if (maxv < NV | maxe < NE) {
        ## sample if the graph is too large
        V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
        seqs = sample(seqs, V_sample_size)
        blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
            names(seqs), ]
    }

    blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
    vnames = unique(c(blastTable$query, blastTable$subject))
    vindex = seq_along(vnames)
    names(vindex) = vnames
    ## correct orientation
    cat("creating graph\n")
    G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
        vertices = vnames)
    if (include_layout) {
        ## temporarily save the modified blastTable to free memory
        tmpB = tempfile()
        save(blastTable, file = tmpB)
        rm(blastTable)
        cat("graph layout calculation ")
        if (ecount(G) > 2e+06) {
            cat("using fruchterman reingold\n")
            L = layout.fruchterman.reingold(G, dim = 3)
        } else {
            cat("using OGDF & fruchterman reingold\n")
            Ltmp = OGDFlayout(G, alg = c("fmmm"))
            L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
        }

        GL = list(G = G, L = L)
        save(GL, file = graph_destination)
        if (!is.null(image_file)) {
            cat("exporting graph figure\n")
            png(image_file, width = 900, height = 900, pointsize = 20)
            plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
                edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
                vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
            dev.off()
            ## create thumbnails:
            system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
        }
        rm(GL)
        gc()
        load(tmpB)
        unlink(tmpB)
    }

    if (!is.connected(G)) {
        cat("Warning - graph is not connected\n")
        cc = clusters(G, "weak")
        mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
            decreasing = TRUE)))))
        selvi = which(mb == 1) # largest component
        cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
            100, 1), "% ;", length(selvi), "reads", "\n")
        G = induced.subgraph(G, vids = selvi)
        blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
            V(G)$name, ]
    }

    Gmst = minimum.spanning.tree(G)
    # create alternative trees - in case only a suboptimal solution is found
    set.seed(123)
    Gmst_alt = list()
    Gmst_alt[[1]] = Gmst
    for (i in 2:6) {
        E(G)$weight = runif(ecount(G), 0.1, 1)
        Gmst_alt[[i]] = minimum.spanning.tree(G)
    }

    rm(G)
    gc()
    ## six attempts to reorient reads
    flip_names_all = list()
    prop_of_notfit = numeric()
    thr_fit = 0.001
    for (ii in 1:6) {
        Gmst = Gmst_alt[[ii]]

        blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
        colnames(blastTable_mst) = c("query", "subject")
        blastTable_mst$sign = E(Gmst)$sign

        d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
        esign = E(Gmst)$sign
        rc = rep(FALSE, vcount(Gmst))
        j = 0
        p = 0
        for (v in d$order[-1]) {
            j = j + 1
            f = as.numeric(d$father)[v]
            if (is.na(f)) {
                next
            }
            eid = get.edge.ids(Gmst, c(v, f))
            if (esign[eid] == -1) {
                ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
                    V(Gmst)$name[v]))) # incident edges
                esign[ie] = esign[ie] * -1
                rc[v] = TRUE
                p = p + 1
                if (p > 50) {
                    p = 0
                    cat("\r")
                    cat(j, " : ", sum(esign)/length(esign))
                }
            }
        }
        cat("\t")
        flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]

        if (!exists("blastTable")) {
            load(tmpB)
            unlink(tmpB)
        }
        ## add nofit later!!
        s.mean = with(blastTable, (s.start + s.end)/2)
        s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
            s.mean, s.mean)
        q.mean = with(blastTable, (q.start + q.end)/2)
        q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
            q.mean, q.mean)
        V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
        V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
        sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
            1) * blastTable$sign
        nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
        prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
        ## check if correctly oriented
        cat("prop notfit", prop_of_notfit[[ii]], "\n")
        if (prop_of_notfit[[ii]] < thr_fit) {
            ## OK
            break
        }
    }
    if (!prop_of_notfit[[ii]] < thr_fit) {
        ## get the best solution
        ii_best = which.min(prop_of_notfit)
        if (ii != ii_best) { # the last solution was not the best - recompute with the best
            flip_names = flip_names_all[[ii_best]]
            s.mean = with(blastTable, (s.start + s.end)/2)
            s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
                s.mean, s.mean)
            q.mean = with(blastTable, (q.start + q.end)/2)
            q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
                q.mean, q.mean)
            V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
            V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
            sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
                1) * blastTable$sign
            nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
        }
    }

    ## exclude all nofit
    df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
    rm(blastTable)
    gc()
    vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
    G = graph.data.frame(df2, vertices = vertices)
    vcount_ori = vcount(G)
    G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))

    G$escore_mst = sum(esign)/length(esign)
    G$escore = sum(sign_final == 1)/length(sign_final)
    cc = clusters(G, "strong")
    mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
        decreasing = TRUE)))))
    names(mb) = V(G)$name
    G$loop_index = max(cc$csize)/vcount(G)
    G$coverage = vcount(G)/vcount_ori
    ## check sign in all edges
    V(G)$membership = mb
    save(G, file = directed_graph_destination)
    ## remove nofit
    seqs = seqs[!names(seqs) %in% nofit]
    seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
    names(seqs) = paste(names(seqs), mb[names(seqs)])
    writeXStringSet(seqs, filepath = oriented_sequences)
    ## calculate satellite probability
    if (is.null(satellite_model_path)) {
        pSAT = isSAT = NULL
    } else {
        satellite_model = readRDS(satellite_model_path)
        pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
        isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
    }

    ## collect summary statistics for the largest component
    output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
        loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
        oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
        satellite_probability = pSAT, satellite = isSAT)
    ## clean up
    all_objects = ls()
    do_not_remove = "output"
    rm(list = all_objects[!(all_objects %in% do_not_remove)])
    gc(verbose = FALSE, reset = TRUE)
    return(list2dictionary(output))
}

list2dictionary = function(l) {
    ## serialize a flat R list into a Python-style dict literal
    dict = "{"
    q = '"'
    for (i in 1:length(l)) {
        if (class(l[[i]]) == "character" | is.null(l[[i]])) {
            q2 = "'''"
        } else {
            q2 = ""
        }
        dict = paste0(
            dict,
            q, names(l)[i], q, ":", q2, l[[i]], q2, ", "
        )
    }
    dict = paste0(dict, "}")
    return(dict)
}

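## Usage sketch (not executed) - string values are wrapped in triple quotes
## so the calling Python code can evaluate the result.
if (FALSE) {
    list2dictionary(list(score = 0.95, name = "CL1"))
    ## {"score":0.95, "name":'''CL1''', }
}
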
wrap = Vectorize(function(s, width = 80) {
    ## split a long string into width-sized chunks separated by newlines
    i1 = seq(1, nchar(s), width)
    i2 = seq(width, by = width, length.out = length(i1))
    return(paste(substring(s, i1, i2), collapse = "\n"))
})

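## Usage sketch (not executed) - used below to format consensus sequences
## for the HTML report.
if (FALSE) {
    cat(wrap(paste(rep("ACGT", 50), collapse = ""), width = 40))
}

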
tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
    CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
    include_layout = TRUE, paired = TRUE, lock_file = NULL) {
    options(CPU = CPU)
    time0 = Sys.time()
    dir.create(output_dir)
    input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")

    if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
        cat(paste("cannot copy", input_sequences, "to", output_dir), "\n")
        stop()
    }

    lock_file = waitForRAM(lock_file = lock_file)
    pair_completeness = NULL
    ## sampling
    if (sample_size != 0) {
        s = readDNAStringSet(input_sequences_copy)
        N = length(s)
        ## pair completeness must be calculated before sampling!
        if (N > sample_size) {
            writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
            if (paired) {
                pair_counts = tabulate(table(gsub(".$", "", names(s))))
                pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
            }
        }
        rm(s)
    }

    if (reorient_reads) {
        input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
        graph_file = paste0(input_sequences_copy, "_graph.RData")
        GLfile = paste0(input_sequences_copy, "_graph.GL")
        cat("reorienting sequences\n")

        blastTable = mgblast(input_sequences_copy, input_sequences_copy)

        graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
            input_sequences_oriented, include_layout = include_layout, paired = paired,
            pair_completeness = pair_completeness)

        ## interrupt if it does not look like a tandem repeat at all # soft threshold!
        if (is.null(graph_info$pair_completeness)) {
            if (graph_info$loop_index <= 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        } else {
            ## for paired reads:
            if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
                cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                return(list2dictionary(list(graph_info = graph_info)))
            }
        }

    } else {
        input_sequences_oriented = input_sequences_copy
        graph_info = NULL
    }

    kmer_counts = list()
    kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
    for (i in kmer_lengths) {
        ## k-mer counting is delegated to a python script - faster;
        ## script.dir must be set by the sourcing script
        cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
            i, sep = "")
        cat("calculating kmers of length ", i, "\n")
        f = system(cmd, intern = TRUE)
        x = read.table(f, as.is = TRUE, sep = "\t")
        kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
    }

    ## number of kmers:
    N50 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.5)
    })

    N70 = sapply(kmer_counts, function(x) {
        sum(cumsum(x$freq) < 0.7)
    })

    time1 = Sys.time()
    ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
    time2 = Sys.time()
    cat("kmers graphs created ")
    print(time2 - time1)
    names(ggmin) = names(kmer_counts)

    ## estimate monomer
    monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)

    names(monomers) = names(kmer_counts)
    monomers = monomers[!sapply(monomers, is.null)]
    ## error handling:
    error = sapply(monomers, class) == "try-error"
    if (any(error)) {
        cat("\nError in monomers estimation: ")
        cat("calculation failed for monomers length ", names(monomers)[error], "\n")
        print(monomers[error])
        if (any(!error)) {
            monomers = monomers[!error]
        } else {
            stop("monomer estimation failed")
        }
    }

    ## summary - TODO: factor this out into a function!
    total_score = list()
    k = 0
    length_best = numeric()
    score_bn = numeric()
    consensus = character()

    for (i in names(monomers)) {
        for (v in seq_along(monomers[[i]]$estimates)) {
            k = k + 1
            total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
            score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
            length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
                1]
            consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
        }
    }
    summary_table = as.data.frame(do.call(rbind, total_score))
    summary_table$monomer_length = length_best
    summary_table$consensus_length = nchar(consensus)
    summary_table$score_bn = score_bn
    summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>",
        sep = "")
    consensus_DS = DNAStringSet(consensus)
    names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
        signif(total_score), "_l_", monomer_length))

    ## filter by score - keep

    ## reorder by score
    consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
    summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
        ]
    rownames(summary_table) = NULL
    N = nrow(summary_table)
    ## concatenate the consensus (i.e. make a head-to-tail dimer) for pbs detection,
    ## and build a 'pseudo contig' multimer for mapping - minimum length 200 bp

    consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
    tarean_contigs = DNAStringSet(sapply(consensus_DS, function(x)
        ifelse(nchar(x) < 200,
               paste(rep(as.character(x), round(300/nchar(as.character(x)) + 1)), collapse = ""),
               as.character(x)))
    )

    names(consensus_DS_dimer) = names(consensus_DS)
    # save sequences:
    consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
    consensus_DS_file = paste0(output_dir, "/consensus.fasta")
    tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
    writeXStringSet(consensus_DS, consensus_DS_file)
    writeXStringSet(tarean_contigs, tarean_contig_file)
    writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
    if (is.null(tRNA_database_path)) {
        pbs_score = -1
    } else {
        pbs_blast_table = paste0(output_dir, "/pbs_detection")
        pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
            reads_file = input_sequences_oriented, output = pbs_blast_table)
    }
    ## search for open reading frames; get the length of the longest one
    orf_l = max_ORF_length(consensus_DS_dimer)

    dir.create(paste0(output_dir, "/img"), recursive = TRUE)
    summary_table$monomer_length_graph = numeric(N)
    summary_table$graph_image = character(N)
    summary_table$monomer_length_logo = numeric(N)
    summary_table$logo_image = character(N)
    ## export graph and consensus estimate to the cluster directory; the type of
    ## output may change in the future
    save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
    save(monomers, file = paste0(output_dir, "/monomers.RData"))

1328 cat("generating HTML output\n")
|
|
1329 for (i in 1:N) {
|
|
1330 kmer = as.character(summary_table$kmer[i])
|
|
1331 variant = summary_table$variant[i]
|
|
1332 ## export graph
|
|
1333 fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
|
|
1334 fout = paste0(output_dir, "/", fout_link)
|
|
1335 ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
|
|
1336 ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
|
|
1337 summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)
|
|
1338
|
|
1339 if (i <= 10) {
|
|
1340 png(fout, width = 800, height = 800)
|
|
1341 plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
|
|
1342 dev.off()
|
|
1343 summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
|
|
1344 table = FALSE, width = 100, height = 100)
|
|
1345 ## export logo
|
|
1346 png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
|
|
1347 fout = paste0(output_dir, "/", png_link)
|
|
1348 png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
|
|
1349 1) + 550)
|
|
1350 try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
|
|
1351 dev.off()
|
|
1352 ## export corresponding position probability matrices
|
|
1353 ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
|
|
1354 ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
|
|
1355 write.table(monomers[[kmer]]$estimates[[variant]]$CM,
|
|
1356 file = ppm_file,
|
|
1357 col.names = TRUE, quote = FALSE,
|
|
1358 row.names = FALSE, sep="\t")
|
|
1359 summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
|
|
1360 table = FALSE, width = 200, height = 100)
|
|
1361 }
|
|
1362
|
|
1363 }
|
|
1364
|
|
1365 ## html_report = HTMLReport()
|
|
1366
|
|
1367 htmlfile = paste0(output_dir, "/report.html")
|
|
1368 cat(htmlheader, file = htmlfile)
|
|
1369 included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
|
|
1370 summary_table_clean = summary_table[,included_columns]
|
|
1371 colnames(summary_table_clean) = c('k-mer length',
|
|
1372 'Variant index',
|
|
1373 'k-mer coverage score',
|
|
1374 'Consensus length',
|
|
1375 'Consensus sequence',
|
|
1376 'k-mer based graph',
|
|
1377 'Sequence logo')
|
|
1378 HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
|
|
1379 HTMLEndFile(file = htmlfile)
|
|
1380 time4 = Sys.time()
|
|
1381 print(time4 - time0)
|
|
1382 if (!is.null(lock_file)){
|
|
1383 print("------removing-lock--------")
|
|
1384 removelock(lock_file)
|
|
1385 }
|
|
1386
|
|
1387 print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
|
|
1388 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
|
|
1389 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
|
|
1390 orf_l = orf_l, tarean_contig_file = tarean_contig_file))
|
|
1391 return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
|
|
1392 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
|
|
1393 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
|
|
1394 orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
|
|
1395 }
|
|
1396
|
|
1397
|
|
## graph loop index stability
loop_index_instability = function(G) {
    ## subsample the graph and regress the loop index on the sampled fraction;
    ## the slope serves as an instability measure
    N = 50
    s = seq(vcount(G), vcount(G)/10, length.out = N)
    p = seq(1, 0.1, length.out = N)
    li = numeric()
    for (i in seq_along(s)) {
        print(i)
        gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
        li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
    }
    instability = lm(li ~ p)$coefficient[2]
    return(instability)
}

isSatellite = function(x, y, model) {
    p = get_prob(x, y, model)
    if (p > model$cutoff) {
        return("Putative Satellite")
    } else {
        return("")
    }
}

get_prob = function(x, y, model) {
    ## look up P(satellite) in a grid over x = loop index, y = pair completeness
    pm = model$prob_matrix
    N = ncol(pm)
    i = round(x * (N - 1)) + 1
    j = round(y * (N - 1)) + 1
    p = pm[i, j]
    return(p)
}

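## Usage sketch (not executed) - the model object below is hypothetical; the
## real one is loaded from satellite_model_path in mgblast2graph.
if (FALSE) {
    model = list(prob_matrix = matrix(runif(101 * 101), ncol = 101), cutoff = 0.5)
    get_prob(0.85, 0.9, model)
    isSatellite(0.85, 0.9, model)
}
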
detectMemUsage = function() {
    ## fraction of RAM in use, derived from /proc/meminfo (Linux only)
    con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
    memInfo = read.table(con, fill = TRUE, row.names = 1)
    close(con)
    memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
        1]
    return(memUsage)
}


makelock <- function(lockfile, lockmsg, CreateDirectories = TRUE) {
    lockdir = dirname(lockfile)
    if (!file.exists(lockdir)) {
        if (CreateDirectories) dir.create(lockdir, recursive = TRUE)
        else stop("Lock Directory for lockfile ", lockfile, " does not exist")
    }
    if (missing(lockmsg)) lockmsg = paste(system("hostname", intern = TRUE), Sys.getenv("R_SESSION_TMPDIR"))
    if (file.exists(lockfile)) return(FALSE)
    # note: writing the message in a single cat() call makes it atomic
    cat(paste(lockmsg, "\n", sep = ""), file = lockfile, append = TRUE, sep = "")
    firstline = readLines(lockfile, n = 1)
    if (firstline != lockmsg) {
        # somebody else got there first
        return(FALSE)
    } else return(TRUE)
}

removelock <- function(lockfile) {
    if (unlink(lockfile) != 0) {
        warning("Unable to remove ", lockfile)
        return(FALSE)
    }
    return(TRUE)
}

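## Usage sketch (not executed) - a typical lock/unlock cycle around a
## critical section; the lock path is hypothetical.
if (FALSE) {
    lf = "/tmp/tarean.lock"
    if (makelock(lf)) {
        ## ... memory-hungry work ...
        removelock(lf)
    }
}

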
waitForRAM = function(p = 0.5, lock_file = NULL) {
    if (detectMemUsage() < p) {
        return(NULL)
    } else {
        cat("waiting for RAM \n")
        free_count = 0
        while (TRUE) {
            ## check lock file:
            if (makelock(lock_file)) {
                print("---------locking--------")
                return(lock_file)
            }
            if (detectMemUsage() < p) {
                cat("RAM freed \n")
                return(NULL)
            }
            Sys.sleep(5)
            if (evaluate_user_cpu_usage() == "free") {
                free_count = free_count + 1
            } else {
                free_count = 0
            }
            if (detectMemUsage() < 0.8 & free_count > 100) {
                cat("RAM not free but nothing else is running \n")
                return(NULL)
            }
        }
    }
}

lsmem = function() {
    ## list objects and their sizes in all environments on the search path
    g = globalenv()
    out_all = envs = list()
    envs = append(envs, g)
    total_size = numeric()
    while (environmentName(g) != "R_EmptyEnv") {
        g <- parent.env(g)
        envs = append(envs, g)
    }
    for (e in envs) {
        obj = ls(envir = e)
        if (length(obj) == 0) {
            break
        }
        obj.size = list()
        for (i in obj) {
            obj.size[[i]] = object.size(get(i, envir = e))
        }
        out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
        out = out[order(out$size, decreasing = TRUE), ]
        out_all = append(out_all, out)
        total_size = append(total_size, sum(out$size))
    }
    return(list(objects = out_all, total_size = total_size))
}

evaluate_user_cpu_usage = function() {
    ## report 'free' when the current user has (almost) no running processes
    user = Sys.info()["user"]
    a = sum(as.numeric(system(paste("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
    s = substring(system(paste("ps -e -o stat -u", user), intern = TRUE)[-1], 1, 1)
    if (a < 5 & sum(s %in% "D") == 0 & sum(s %in% "R") < 2) {
        status = "free"
    } else {
        status = "full"
    }
    return(status)
}
|