annotate lib/tarean/methods.R @ 10:6212933b1a83 draft

Uploaded
author petrn
date Thu, 02 Jan 2020 10:23:49 +0000
parents f6ebec6e235e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1 #!/user/bin/env Rscript
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 suppressPackageStartupMessages(library(igraph))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 suppressPackageStartupMessages(library(parallel))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5 suppressPackageStartupMessages(library(Biostrings))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 suppressPackageStartupMessages(library(scales))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 suppressPackageStartupMessages(library(stringr))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
8 suppressPackageStartupMessages(library(hwriter))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
9 suppressPackageStartupMessages(library(R2HTML))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
10 suppressPackageStartupMessages(library(plyr))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
11 suppressPackageStartupMessages(library(dplyr))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
12
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
13 max_ORF_length = function(s) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
14 ## check all frames
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
15 L = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
16 for (i in 1:3) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17 L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
18 fixed = TRUE))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
19 L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
20 i))), "*", fixed = TRUE))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
21 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
22 return(L)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
23 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
24
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
25 kmers2graph = function(kmers, mode = "strong", prop = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
26 kmerLength = nchar(kmers[1, 1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
27 if (ncol(kmers) == 2) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
28 kmers$size = kmers[, 2]/sum(kmers[, 2])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
29 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
30 colnames(kmers) = c("name", "count", "size")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
31 if (!is.null(prop)) { # tohle se nepouziva(prop je null), a je to asi spatne - filtuje se to pred tridenim!!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
32 p = cumsum(kmers$size)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
33 kmers = kmers[p < prop, ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
34 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
35 N = dim(kmers)[1]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
36 kmers = kmers[order(kmers$size), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
37 ## convert kmers to fasta file
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
38 kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
39 kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
40
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
41 ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
42 df = inner_join(kme,kms, by = 'kmer')[,c(2,3)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
43
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
44 ## names(kms) = seq_along(kms)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
45 ## kme = substring(kmers$name, 2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
46 ## names(kme) = seq_along(kme)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
47 ## ## use new blast!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
48 ## database = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
49 ## query = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
50 ## output = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
51 ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
52 ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
53 ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
54 ## system(cmd, ignore.stdout = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
55 ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident' -strand plus -dust no -perc_identity 100 -query ",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
56 ## query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
57 ## system(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
58 ## df = try({
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
59 ## read.table(output, as.is = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
60 ## })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
61 ## if (class(df) == "try-error"){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
62 ## print("creation of kmer graph failed")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
63 ## print(query)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
64 ## print(output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
65 ## print(database)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
66 ## return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67 ## }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68 ## unlink(query)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
69 ## unlink(paste(database, "*", sep = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
70 ## unlist(output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
71 gm_mean = function(x, na.rm = TRUE) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
72 exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
74
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
75 whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
76 G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
77 2]], weight = whg), vertices = kmers[, 1:3])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
78 # separate to connected components:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79 ccs = clusters(G, mode = mode)$membership
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
80 sel_cls = which(tabulate(ccs) > 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
81 Gs = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
82 for (i in seq_along(sel_cls)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
83 Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
84 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
85 ## reorder!!!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
86 Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
87 return(Gs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
88 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
89
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
90
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
91 OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
92 ## is ogdf binary available?
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
93 if (is.null(OGDF)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
94 OGDF = Sys.getenv("OGDF")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
95 if ("" == OGDF) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
96 options(warn = -1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97 OGDF = system("which runOGDFlayout", intern = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98 options(warn = 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
99 if (length(OGDF) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
100 cat("path to runOGDFlayout not found\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
101 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
102 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
103
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
104 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
105 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
106 if (is.null(ncol)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
107 if (is.null(E(G)$weight)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
108 el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
109 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
110 el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
111 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
112 ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
113 write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
114 quote = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
115 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
116 # copy ncol:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
117 ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
118 sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
119 file.copy(ncol, ncol_tmp)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
120 ncol = ncol_tmp
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
121 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
122 algopt = c("fmmm", "sm", "fme")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
123 if (!(alg %in% c("fmmm", "sm", "fme") && TRUE)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
124 stop("alg must by :", algopt, "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
125
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
126 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
127
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
128 # output file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
129 Louts = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
130 layout_file = tempfile(pattern = as.character(Sys.getpid()))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
131 for (i in alg) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
132 cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
133 ncol)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
134 system(cmd, intern = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
135 L = read.table(layout_file, skip = ecount(G))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
136 L = L[match(V(G)$name, L$V2), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
137 Lout = as.matrix(L[, -(1:2)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
138 unlink(layout_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
139 Louts[[i]] = Lout
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
140
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
141 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
142 # clean up
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
143 unlink(ncol)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
144 return(Louts)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
145 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
146
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
147 xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
148
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
149 kmers2color = function(s, position = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
150 if (is.null(position)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
151 position = round(nchar(s[1])/2, 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
152 ## position = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
153 position = nchar(s[1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
154 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
155 color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
156 color_base = substring(s, position, position)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
157 colors = color_code[color_base]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
158 names(colors) = color_base
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
159 return(colors)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
160 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
161 get_sequence = function(g, v, position = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
162 s = V(g)$name
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
163 if (is.null(position)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
164 position = round(nchar(s[1])/2, 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
165 ## position = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
166 position = nchar(s[1])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
167 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
168 nt = paste(substring(s[v], position, position), collapse = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
169 return(nt)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
170 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
171
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
172
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
173 get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
174 if (is.null(start)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
175 i = sum(cumsum(km$freq) < 0.5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
176 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
177 i = sum(cumsum(km$freq) < start)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
178 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
179 continue = TRUE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
180 while (continue) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
181 if (i > nrow(km)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
182 i = nrow(km)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
183 continue = FALSE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
184 step = 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
185 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
186 GG = kmers2graph(km[1:i, ])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
187 if (length(GG) > 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
188 if (vcount(GG[[1]]) > thr) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
189 if (sum(V(GG[[1]])$size) >= min_coverage) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
190 GG[[1]]$input_coverage = sum(km$freq[1:i])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
191 GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
192 return(GG[[1]])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
193 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
194 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
195 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
196 i = round(i * step)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
197
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
198 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
199 if (length(GG) == 0 | is.null(GG)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
200 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
201 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
202
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
203 GG[[1]]$input_coverage = sum(km$freq[1:i])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
204 GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
205 return(GG[[1]])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
206 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
207
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
208
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
209
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
210 paths2string = function(paths) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
211 pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
212 return(pathstring)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
213 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
214
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
215
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
216 align_paths = function(paths, G) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
217 shift = rep(NA, length(paths))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
218 thr = 0 # minimal length
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
219 tr_paths = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
220 Seqs = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
221 centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
222
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
223 for (i in seq_along(paths)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
224 if (centre_node %in% paths[[i]]) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
225 S = which(paths[[i]] %in% centre_node)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
226 shift[i] = S
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
227 if (S == 1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
228 tr_paths[[i]] = paths[[i]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
229 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
230 tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
231 1)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
232 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
233 Seqs[[i]] = get_sequence(G, tr_paths[[i]])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
234 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
235 shift[i] = NA
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
236 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
237 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
238 paths_n = lapply(paths, as.numeric)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
239 tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
240 new_shift = shift
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
241 for (i in which(is.na(shift))) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
242 score = numeric(length(paths_n))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
243 for (S in seq_along(paths_n[[i]])) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
244 if (S == 1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
245 path_tmp_n = paths_n[[i]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
246 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
247 path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
248 1)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
249 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
250 score[S] = sum(tr_paths_n == path_tmp_n)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
251 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
252 if (sum(score) != 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
253 S = which.max(score)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
254 new_shift[i] = S
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
255 if (S == 1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
256 tr_paths[[i]] = paths[[i]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
257 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
258 tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
259 1)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
260 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
261 Seqs[[i]] = get_sequence(G, tr_paths[[i]])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
262 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
263 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
264 shift = new_shift
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
265 # try to shift based on the sequences itself
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
266 return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
267 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
268
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
269 make_consensus = function(paths_info, G) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
270 include = !is.na(paths_info$shift)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
271 ## make alignments using mafft
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
272 aln = mafft(unlist(paths_info$Seqs[include]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
273 CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
274 G = G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
275 gaps = get_gaps_from_alignment(aln)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
276 CMnorm = CM/rowSums(CM)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
277 bases = colnames(CM)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
278 consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x >
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
279 0.2], decreasing = TRUE)]), paste, collapse = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
280 consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
281 fixed = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
282 number_of_SNP = sum(rowSums(CM > 0) > 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
283 SNP_positions = which(rowSums(CM > 0) > 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
284 number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
285 "")) == "-") > 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
286 indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) ==
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
287 "-") > 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
288 if (length(SNP_positions) > 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
289 variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
290 1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
291 ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
292 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
293 variable_sites = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
294 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
295 variable_positions = unique(SNP_positions, indel_positions)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
296 return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
297 number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
298 indel_positions = indel_positions, variable_positions = variable_positions,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
299 variable_sites = variable_sites, gaps = gaps))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
300 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
301
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
302
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
303 estimate_monomer = function(G, weights = NULL, limit = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
304 if (is.null(G)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
305 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
306 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
307 ## estimate monomer from kmer based graph
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
308 V(G)$id = 1:vcount(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
309 GS = induced_subgraph(G, vids = which(degree(G) == 2)) ## keep only vertices without branching
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
310 cls = clusters(GS)$membership
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
311
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
312
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
313 ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
314 cls)) ## from each branch use only one vertex with larges size!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
315
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
316
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
317 ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
318 ids_size = V(G)$size[ids]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
319 N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
320 if (length(ids) > 10000) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
321 ids = ids[1:N50]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
322 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
323 ## use only large vertices in search! how many?
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
324 el = get.edgelist(G, names = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
325 node_use = numeric(vcount(G))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
326 LL = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
327 ## W=numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
328 i = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
329 paths = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
330 if (is.null(weights)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
331 weights = (max(E(G)$weight) - (E(G)$weight) + 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
332 weights = E(G)$weight^(-3)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
333 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
334 included = rep(FALSE, vcount(G))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
335 W_total = sum(V(G)$size)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
336
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
337 coverage = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
338 t0 = c()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
339 i = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
340 j = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
341 for (n in ids) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
342 j = j + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
343 t0[j] = Sys.time()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
344 if (included[n]) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
345 next
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
346 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
347 m = which(el[, 1] %in% n)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
348 i = i + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
349 s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
350 included[as.numeric(s$vpath[[1]])] = TRUE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
351 paths[[i]] = s$vpath[[1]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
352 LL[i] = (length(s$vpath[[1]]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
353 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
354
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
355 ## evaluate if paths should be divided to variants - by length and path weight
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
356 paths_clusters = split(paths, LL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
357 paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
358
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
359 ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
360 paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
361
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
362 ## evaluate weight for individual paths:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
363 for (v in seq_along(paths_consensus)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
364 p = paths_clusters_tr[[v]]$tr_paths
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
365 ## clean
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
366 p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
367 L = sapply(p, length)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
368 p_groups = split(p, L)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
369 w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
370 as.numeric)))]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
371 total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
372 LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
373 LW = LW[order(LW$weight, decreasing = TRUE), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
374 rownames(LW) = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
375 paths_consensus[[v]]$total_score = total_score
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
376 paths_consensus[[v]]$length_variant_score = LW
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
377
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
378 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
379 return(list(estimates = paths_consensus, paths = paths_clusters_tr))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
380 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
381
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
382
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
383
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
384 detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
385 ## read_file contain oriented reads!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
386 min_offset = 10
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
387 max_end_dist = 2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
388 min_aln_length = 30
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
389 max_pbs_search = 30
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
390 thr_length = 12
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
391 end_position = 23
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
392 insertion_proportion_threshold <- 0.15
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
393 ## read_file contain oriented reads! for testing
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
394 if (FALSE) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
395 library(Biostrings)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
396 thr_length = 12
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
397 end_position = 23
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
398 dimers_file = "consensus_dimer.fasta"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
399 reads_file = "reads_oriented.fas"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
400 tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
401
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
402 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
403 ## find read which are half aligned do reference dimer
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
404 dimers_seq = readDNAStringSet(dimers_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
405 cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
406 system(cmd, ignore.stdout = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
407 output = paste0(reads_file, "blast_out.cvs")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
408 columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
409 "slen", "pident", "length")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
410 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
411 reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
412 "-out", output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
413 system(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
414 blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
415 unlink(output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
416 blastdf = blastdf[blastdf$length >= min_aln_length, ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
417
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
418 ## expand two whole read to see unligned reads parts
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
419 blastdf_expand = blastdf
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
420 blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
421 blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
422 blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
423 blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
424 pS = blastdf$sstart
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
425 pE = blastdf$send
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
426 pSF = blastdf_expand$sstart
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
427 pEF = blastdf_expand$send
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
428 cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) &
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
429 blastdf$sstart >= max_pbs_search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
430 cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <=
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
431 blastdf$slen - max_pbs_search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
432
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
433 ## coverage of alignments: evaluate coverage at site with breaks, it is neccessary
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
434 ## that there must be at least 20% of interupted alignments at given position to
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
435 ## be considered for additional search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
436 coverage_profiles = subject_coverage(blastdf)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
437
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
438
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
439 ## extract flanking sequences - cca 50nt, it should contain tRNA sequence search
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
440 ## Left
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
441 fin = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
442 fout = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
443 scoreL = scoreR = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
444
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
445
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
446 ## left side unique insertion sites
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
447 if (any(cond1)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
448 insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
449 ## check coverage
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
450 insertion_sites$coverage = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
451 for (i in 1:nrow(insertion_sites)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
452 insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
453 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
454 insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage >
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
455 insertion_proportion_threshold), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
456
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
457 if (nrow(insertion_OK_left) > 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
458 s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
459 "sstart"] - max_pbs_search)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
460 Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
461 s, insertion_OK_left$sstart)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
462
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
463 ## names are CONSENSUSID__READID_position
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
464 names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
465 insertion_OK_left$sstart)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
466 ## check presence TG dinucleotide
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
467 TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
468 names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart +
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
469 2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
470 if (any(TG > 0)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
471 writeXStringSet(Lpart[TG > 0], filepath = fin)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
472 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
473 "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
474 "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
475 "-out", fout)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
476
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
477 system(cmd, ignore.stdout = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
478 df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
479 filter1 = df$length >= thr_length
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
480 filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
481 df_pass = df[filter1 & filter2, , drop = FALSE]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
482 df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
483 scoreL = get_score(df_pass_L)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
484 write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
485 sep = "\t")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
486 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
487 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
488 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
489 if (any(cond2)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
490 ## search Right
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
491 insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
492 ## check coverage
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
493 insertion_sites$coverage = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
494 for (i in 1:nrow(insertion_sites)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
495 insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
496 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
497 insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage >
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
498 insertion_proportion_threshold), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
499
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
500 if (nrow(insertion_OK_right) > 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
501 s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
502 insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
503 Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
504 insertion_OK_right$send, s)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
505 names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
506 insertion_OK_right$send)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
507
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
508 ## check presence CA dinucleotide
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
509 CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
510 names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send +
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
511 2, ))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
512 if (any(CA > 0)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
513 writeXStringSet(Rpart[CA > 0], filepath = fin)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
514 cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
515 "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
516 "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
517 "-out", fout)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
518
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
519 system(cmd, ignore.stdout = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
520 df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
521 filter1 = df$length >= thr_length
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
522 filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
523 df_pass = df[filter1 & filter2, , drop = FALSE]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
524 df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
525 write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
526 sep = "\t")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
527 scoreR = get_score(df_pass_R)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
528 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
529 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
530 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
531 unlink(fin)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
532 unlink(fout)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
533 return(max(scoreL, scoreR))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
534 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
535
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
536
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
537 subject_coverage = function(blastdf) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
538 ## calculate coverage for all blast subjects
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
539 coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
540 as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
541 })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
542 return(coverage_profiles)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
543
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
544 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
545
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
546 get_score = function(x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
547 ## keep best tRNA
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
548 if (nrow(x) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
549 return(0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
550 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
551 xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
552 xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
553 xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
554 best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
555 return(best_score)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
556 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
557
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
558
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
559
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
560 dotter = function(seq1, seq2 = NULL, params = "") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
561 if (is.null(seq2)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
562 seq2 = seq1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
563 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
564 library(Biostrings)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
565 if (class(seq1) != "DNAStringSet") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
566 seq1 = BStringSet(seq1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
567 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
568 if (class(seq2) != "DNAStringSet") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
569 seq2 = BStringSet(seq2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
570 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
571 sf1 = tempfile("seq1")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
572 writeXStringSet(seq1, file = sf1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
573 sf2 = tempfile("seq2")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
574 writeXStringSet(seq2, file = sf2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
575 system(paste("dotter", params, sf1, sf2), wait = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
576 Sys.sleep(2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
577 unlink(c(sf1, sf2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
578 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
579 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
580
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
581
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
582
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
583 dotter2 = function(seq1, seq2 = NULL, params = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
584 if (is.null(seq2)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
585 seq2 = seq1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
586 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
587 if (is.null(params)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
588 params = " -windowsize 30 -threshold 45 "
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
589 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
590 library(Biostrings)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
591 if (class(seq1) != "DNAStringSet") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
592 seq1 = DNAStringSet(seq1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
593 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
594 if (class(seq2) != "DNAStringSet") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
595 seq2 = DNAStringSet(seq2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
596 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
597 L1 = nchar(seq1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
598 L2 = nchar(seq2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
599
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
600 tmpdat1 = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
601
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
602 dir.create(tmpdat1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
603 tmpdat2 = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
604 dir.create(tmpdat2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
605 oridir = getwd()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
606
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
607
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
608 seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
609 seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
610 seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
611
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
612
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
613 sf1 = tempfile("seq1")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
614 writeXStringSet(seq1_merged, filepath = sf1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
615 sf2 = tempfile("seq2")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
616 sf2rc = tempfile("seq2rc")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
617 writeXStringSet(seq2_merged, filepath = sf2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
618 writeXStringSet(seq2rc_merged, filepath = sf2rc)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
619
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
620 cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
621 cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
622 params)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
623 setwd(tmpdat1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
624 output1 = system(cmd1, intern = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
625 setwd(tmpdat2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
626 output2 = system(cmd2, intern = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
627 setwd(oridir)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
628
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
629
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
630
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
631 fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
632 rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
633 rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
634
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
635 if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
636 coord1 = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
637 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
638 coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
639 c(3, 5, 7, 9, 11)), 1, as.numeric)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
640 coord1 = matrix(coord1, ncol = 5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
641 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
642
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
643 fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
644 rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
645 rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
646
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
647 if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
648 coord2 = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
649 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
650 coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
651 c(3, 5, 7, 9, 11)), 1, as.numeric)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
652 coord2 = matrix(coord2, ncol = 5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
653 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
654 unlink(sf1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
655 unlink(sf2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
656 unlink(sf2rc)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
657 unlink(tmpdat1, recursive = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
658 unlink(tmpdat2, recursive = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
659
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
660 N1 = sum(nchar(seq1))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
661 N2 = sum(nchar(seq2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
662 op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
663 on.exit(par(op))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
664 plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
665 if (!is.null(coord1)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
666 segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
667 4])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
668 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
669 if (!is.null(coord2)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
670 segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
671 coord2[, 4])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
672 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
673 abline(v = c(0, cumsum(L1)), col = "green")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
674 abline(h = c(0, cumsum(L2)), col = "green")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
675 box()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
676 axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
677 cex.axis = 0.7)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
678 axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
679 cex.axis = 0.7)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
680 invisible(list(coord1, coord2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
681 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
682
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
683
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
684 CM2sequence = function(x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
685 bases = c(colnames(x), "N")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
686 x = cbind(x, 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
687 colnames(x) = bases
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
688 seqs = paste(bases[apply(x, 1, which.max)], collapse = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
689
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
690 return(seqs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
691 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
692
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
693 ## in alingment calculate significance from kmers
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
694 calculate_consensus_matrix = function(aln, tr_paths, G) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
695 bases = c("A", "C", "G", "T", "-")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
696 positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in%
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
697 "-"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
698 base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
699 weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
700 kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
701 for (i in seq_along(positions)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
702 weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
703 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
704 names(kmer_rel_proportions) = V(G)$name
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
705 ## get weights for gaps by approximation
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
706 fitgaps = function(y) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
707 if (sum(y == 0) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
708 return(y)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
709 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
710 y0 = rep(y[y != 0], 3)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
711 x0 = which(rep(y, 3) != 0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
712 fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
713 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
714 return(fitted$y[-seq_along(y)][seq_along(y)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
715 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
716 weights_with_gaps = t(apply(weights, 1, fitgaps))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
717 ## weights_with_gaps = get_gap_weights(weights, aln,G) ## this step take wey too
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
718 ## long time!!!-not so important TODO handle gaps differently - more effectively
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
719
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
720 ## get consensus matrix
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
721 CM = sapply(1:nchar(aln[[1]]), function(i) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
722 sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
723 sum)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
724 })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
725 return(t(CM)[, 1:5])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
726 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
727
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
728
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
729 get_gaps_from_alignment = function(aln) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
730 as.character(aln)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
731 gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
732 return(gaps_positions)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
733 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
734
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
735 plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
736 highlight = NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
737 if (!is.null(G$L) & is.null(L)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
738 L = G$L
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
739 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
740 if (is.null(L)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
741 ## L=layout.kamada.kawai(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
742 L = OGDFlayout(G)[[1]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
743 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
744 clr = kmers2color(V(G)$name)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
745 if (!is.null(highlight)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
746 clr[highlight] = "#00000080"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
747 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
748 if (!is.null(ord)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
749 clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
750 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
751 if (is.null(vertex.size)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
752 vertex.size = rescale(V(G)$size, to = c(0.5, 6))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
753
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
754 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
755 plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
756 vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
757 edge.arrow.size = 0.2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
758
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
759 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
760
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
761 rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
762 if (is.null(L)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
763 L = layout.kamada.kawai(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
764 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
765
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
766 rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
767 vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
768 edge.arrow.size = 0.5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
769 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
770
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
771
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
772
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
773
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
774 mafft = function(seqs, params = "--auto --thread 1 ") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
775 if (length(seqs) < 2) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
776 return(seqs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
777 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
778 infile = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
779 if (class(seqs) == "character") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
780 seqs = DNAStringSet(seqs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
781 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
782 writeXStringSet(seqs, file = infile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
783 outfile = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
784 cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
785 system(cmd, intern = TRUE, ignore.stderr = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
786 aln = readDNAStringSet(outfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
787 unlink(c(outfile, infile))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
788 return(aln)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
789 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
790
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
791 mgblast = function(databaseSeq, querySeq) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
792 params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
793 # no dust filtering:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
794 paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
795
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
796 database = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
797 query = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
798 output = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
799 if (class(databaseSeq) == "character") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
800 database = databaseSeq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
801 do_not_delete_database = TRUE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
802 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
803 writeXStringSet(databaseSeq, filepath = database, format = "fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
804 do_not_delete_database = FALSE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
805 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
806 if (class(querySeq) == "character") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
807 query = querySeq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
808 do_not_delete_query = TRUE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
809 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
810 writeXStringSet(querySeq, filepath = query, format = "fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
811 do_not_delete_query = FALSE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
812 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
813 ## create database:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
814 cmd = paste("formatdb -i", database, "-p F")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
815 system(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
816 cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
817 system(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
818 if (file.info(output)$size == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
819 # no hist, try wthou dust masker
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
820 cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
821 system(cmd)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
822 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
823 blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
824 unlink(output)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
825 if (!do_not_delete_query) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
826 unlink(query)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
827 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
828 if (!do_not_delete_database) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
829 unlink(paste(database, "*", sep = ""))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
830 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
831 colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
832 "s.start", "s.end", "pid", "weight", "e.value", "strand")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
833 blastOut
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
834 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
835
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
836
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
837 estimate_sample_size = function(NV, NE, maxv, maxe) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
838 ## density
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
839 d = (2 * NE)/(NV * (NV - 1))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
840 eEst = (maxv * (maxv - 1) * d)/2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
841 nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
842 if (eEst >= maxe) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
843 N = round(nEst)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
844 E = round((N * (N - 1) * d)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
845
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
846 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
847 if (nEst >= maxv) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
848 N = maxv
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
849 E = round((N * (N - 1) * d)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
850
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
851 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
852 return(N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
853 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
854
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
855
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
856
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
857
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
858 mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
859 oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
860 include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
861 maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
862 cat("loading blast results\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
863 if (repex) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
864 cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
865 "s.end", "sign")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
866 colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
867 "numeric", "numeric", "numeric", "NULL", "NULL", "character")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
868
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
869 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
870 cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
871 "s.end","weight", "sign")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
872 colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
873 "numeric", "numeric", "NULL", "numeric", "NULL", "character")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
874 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
875 if (class(blastfile) == "data.frame") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
876 blastTable = blastfile
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
877 colnames(blastTable)[12] = "sign"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
878 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
879 blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
880 colClasses = colcls, comment.char = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
881 colnames(blastTable) = cln
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
882 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
883 ## check for duplicates!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
884 key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
885 query)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
886 if (any(duplicated(key))) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
887 blastTable = blastTable[!duplicated(key), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
888 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
889 seqs = readDNAStringSet(seqfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
890 ## calculate pair completeness for reads before sampling
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
891 if (is.null(pair_completeness)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
892 if (paired) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
893 if (seqfile_full != seqfile) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
894 seqs_full = readDNAStringSet(seqfile_full)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
895 pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
896 rm(seqs_full)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
897 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
898 pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
899
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
900 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
901 pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
902 }else{
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
903 pair_completeness = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
904 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
905 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
906 NV = length(seqs) # vertices
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
907 NE = nrow(blastTable) # nodes
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
908 if (maxv < NV | maxe < NE) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
909 ## Sample if graph is large
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
910 V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
911 seqs = sample(seqs, V_sample_size)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
912 blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in%
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
913 names(seqs), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
914 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
915
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
916 blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
917 vnames = unique(c(blastTable$query, blastTable$subject))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
918 vindex = seq_along(vnames)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
919 names(vindex) = vnames
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
920 ## correct orientation
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
921 cat("creating graph\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
922 G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
923 vertices = vnames)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
924 if (include_layout) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
925 ## save temporarily modified blastTable if large
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
926 tmpB = tempfile()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
927 save(blastTable, file = tmpB)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
928 rm(blastTable)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
929 ##
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
930 cat("graph layout calculation ")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
931 if (ecount(G) > 2e+06) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
932 cat("using fruchterman reingold\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
933 L = layout.fruchterman.reingold(G, dim = 3)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
934 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
935 cat("using OGDF & frucherman reingold\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
936 Ltmp = OGDFlayout(G, alg = c("fmmm"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
937 L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
938
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
939 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
940
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
941 GL = list(G = G, L = L)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
942 save(GL, file = graph_destination)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
943 if (!is.null(image_file)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
944 cat("exporting graph figure")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
945 png(image_file, width = 900, height = 900, pointsize = 20)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
946 plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
947 edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
948 vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
949 dev.off()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
950 ## create thunmbs:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
951 system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
952
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
953 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
954 rm(GL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
955 gc()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
956 load(tmpB)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
957 unlink(tmpB)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
958 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
959
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
960 if (!is.connected(G)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
961 cat("Warning - graph is not connected\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
962 cc = clusters(G, "weak")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
963 mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
964 decreasing = TRUE)))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
965 selvi = which(mb == 1) # largest component
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
966 cat("using largest component: \nsize =", round(length(selvi)/vcount(G) *
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
967 100, 1), "% ;", length(selvi), "reads", "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
968 G = induced.subgraph(G, vids = selvi)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
969 blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in%
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
970 V(G)$name, ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
971 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
972
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
973 Gmst = minimum.spanning.tree(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
974 # create alternative trees - for case that unly suboptima solution is found
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
975 set.seed(123)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
976 Gmst_alt = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
977 Gmst_alt[[1]] = Gmst
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
978 for (i in 2:6){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
979 E(G)$weight = runif(ecount(G), 0.1,1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
980 Gmst_alt[[i]] = minimum.spanning.tree(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
981 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
982
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
983 rm(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
984 gc()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
985 ## six attempts to reorient reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
986 flip_names_all = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
987 prop_of_notfit=numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
988 thr_fit=0.001
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
989 for (ii in 1:6){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
990 Gmst = Gmst_alt[[ii]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
991
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
992
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
993 blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
994 colnames(blastTable_mst) = c("query", "subject")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
995 blastTable_mst$sign = E(Gmst)$sign
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
996
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
997 d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
998 esign = E(Gmst)$sign
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
999 rc = rep(FALSE, vcount(Gmst))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1000 j = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1001 p = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1002 for (v in d$order[-1]) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1003 j = j + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1004 f = as.numeric(d$father)[v]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1005 if (is.na(f)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1006 next
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1007 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1008 eid = get.edge.ids(Gmst, c(v, f))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1009 if (esign[eid] == -1) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1010 ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in%
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1011 V(Gmst)$name[v]))) # incident edges
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1012 esign[ie] = esign[ie] * -1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1013 rc[v] = TRUE
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1014 p = p + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1015 if (p > 50) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1016 p = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1017 cat("\r")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1018 cat(j, " : ", sum(esign)/length(esign))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1019 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1020 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1021 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1022 cat("\t")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1023 flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1024
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1025 if (!exists("blastTable")) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1026 load(tmpB)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1027 unlink(tmpB)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1028 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1029 ## add nofit later!!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1030 s.mean = with(blastTable, (s.start + s.end)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1031 s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1032 s.mean, s.mean)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1033 q.mean = with(blastTable, (q.start + q.end)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1034 q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1035 q.mean, q.mean)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1036 V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1037 V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1038 sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1039 1) * blastTable$sign
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1040 nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1041 prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1042 ## check if correctly oriented
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1043 cat("prop notfit", prop_of_notfit[[ii]],"\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1044 if (prop_of_notfit[[ii]]<thr_fit){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1045 ## OK
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1046 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1047 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1048 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1049 if (!prop_of_notfit[[ii]]<thr_fit){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1050 ## get best solution
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1051 ii_best = which.min(prop_of_notfit)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1052 if (ii != ii_best){ # if the last solution was not best, get the best
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1053 flip_names = flip_names_all[[ii_best]]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1054 s.mean = with(blastTable, (s.start + s.end)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1055 s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1056 s.mean, s.mean)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1057 q.mean = with(blastTable, (q.start + q.end)/2)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1058 q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length -
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1059 q.mean, q.mean)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1060 V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1061 V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1062 sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1063 1) * blastTable$sign
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1064 nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1065
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1066 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1067 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1068
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1069 ## exclude all nofit
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1070
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1071 df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1072 rm(blastTable)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1073 gc()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1074 vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1075 G = graph.data.frame(df2, vertices = vertices)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1076 vcount_ori = vcount(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1077 G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1078
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1079 G$escore_mst = sum(esign)/length(esign)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1080 G$escore = sum(sign_final == 1)/length(sign_final)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1081 cc = clusters(G, "strong")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1082 mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1083 decreasing = TRUE)))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1084 names(mb) = V(G)$name
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1085 G$loop_index = max(cc$csize)/vcount(G)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1086 G$coverage = vcount(G)/vcount_ori
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1087 ## check sign in all edges
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1088 V(G)$membership = mb
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1089 save(G, file = directed_graph_destination)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1090 ## remove nofit
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1091 seqs = seqs[!names(seqs) %in% nofit]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1092 seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1093 names(seqs) = paste(names(seqs), mb[names(seqs)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1094 writeXStringSet(seqs, filepath = oriented_sequences)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1095 ## calculate satellite probability
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1096 if (is.null(satellite_model_path)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1097 pSAT = isSAT = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1098 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1099 satellite_model = readRDS(satellite_model_path)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1100 pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1101 isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1102 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1103
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1104 ## get larges cc
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1105 output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1106 loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1107 oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1108 satellite_probability = pSAT, satellite = isSAT)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1109 ## clean up
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1110 all_objects = ls()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1111 do_not_remove = "output"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1112 rm(list = all_objects[!(all_objects %in% do_not_remove)])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1113 gc(verbose = FALSE, reset = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1114 return(list2dictionary(output))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1115 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1116
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1117 list2dictionary = function(l){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1118 dict = "{"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1119 q='"'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1120 for (i in 1:length(l)){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1121 if (class(l[[i]])=="character" | is.null(l[[i]])){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1122 q2 = "'''"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1123 }else{
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1124 q2 = ''
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1125 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1126 dict = paste0(
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1127 dict,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1128 q,names(l)[i],q,":",q2, l[[i]], q2,", "
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1129 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1130 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1131 dict = paste0(dict, "}")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1132 return(dict)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1133 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1134
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1135 wrap = Vectorize(function(s, width = 80) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1136 i1 = seq(1, nchar(s), width)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1137 i2 = seq(width, by = width, length.out = length(i1))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1138 return(paste(substring(s, i1, i2), collapse = "\n"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1139 })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1140
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1141
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1142 tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1143 CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1144 include_layout = TRUE, paired = TRUE, lock_file=NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1145 options(CPU = CPU)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1146 time0 = Sys.time()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1147 dir.create(output_dir)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1148 input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1149
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1150 if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1151 cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1152 stop()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1153 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1154
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1155 lock_file = waitForRAM(lock_file = lock_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1156 pair_completeness = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1157 ## sampling
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1158 if (sample_size != 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1159 s = readDNAStringSet(input_sequences_copy)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1160 N = length(s)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1161 ## pair completness must be calculated before sampling!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1162 if (N > sample_size) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1163 writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1164 if (paired) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1165 pair_counts = tabulate(table(gsub(".$", "", names(s))))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1166 pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1167 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1168 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1169 rm(s)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1170 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1171
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1172 if (reorient_reads) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1173 input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1174 graph_file = paste0(input_sequences_copy, "_graph.RData")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1175 GLfile = paste0(input_sequences_copy, "_graph.GL")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1176 cat("reorientig sequences\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1177
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1178 blastTable = mgblast(input_sequences_copy, input_sequences_copy)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1179
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1180 graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1181 input_sequences_oriented, include_layout = include_layout, paired = paired,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1182 pair_completeness = pair_completeness)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1183
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1184 ## interupt if it does not look like tandem repeat at all # soft threshold!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1185 if (is.null(graph_info$pair_completeness)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1186 if (graph_info$loop_index <= 0.4) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1187 cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1188 return(list2dictionary(list(graph_info = graph_info)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1189 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1190 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1191 ## for paired reads:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1192 if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1193 cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1194 return(list2dictionary(list(graph_info = graph_info)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1195 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1196 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1197
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1198 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1199 input_sequences_oriented = input_sequences_copy
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1200 graph_info = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1201 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1202
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1203
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1204
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1205 kmer_counts = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1206 kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1207 for (i in kmer_lengths) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1208 ## use pythonis.null(l[[i]])) function - faster
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1209 cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1210 i, sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1211 cat("calculation kmers of length ", i, "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1212 f = system(cmd, intern = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1213 x = read.table(f, as.is = TRUE, sep = "\t")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1214 kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1215 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1216
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1217 ## number of kmers:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1218 N50 = sapply(kmer_counts, function(x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1219 sum(cumsum(x$freq) < 0.5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1220 })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1221
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1222 N70 = sapply(kmer_counts, function(x) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1223 sum(cumsum(x$freq) < 0.7)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1224 })
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1225
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1226
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1227 time1 = Sys.time()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1228 ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1229 time2 = Sys.time()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1230 cat("kmers graphs created ")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1231 print(time2 - time1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1232 names(ggmin) = names(kmer_counts)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1233
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1234
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1235 ## estimate monomer
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1236 monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1237
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1238 names(monomers) = names(kmer_counts)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1239 monomers = monomers[!sapply(monomers, is.null)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1240 ## error handling:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1241 error = sapply(monomers, class) == "try-error"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1242 if (any(error)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1243 cat("\nError in monomers estimation: ")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1244 cat("calculation failed for monomers length ", names(monomers)[error], "\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1245 print(monomers[error])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1246 if (any(!error)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1247 monomers = monomers[!error]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1248 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1249 stop("monomer estimation failed")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1250 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1251 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1252
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1253 ## summary - make function!!
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1254 total_score = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1255 k = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1256 length_best = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1257 score_bn = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1258 consensus = character()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1259
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1260 for (i in names(monomers)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1261 for (v in seq_along(monomers[[i]]$estimates)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1262 k = k + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1263 total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1264 score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1265 length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1266 1]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1267 consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1268 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1269 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1270 summary_table = as.data.frame(do.call(rbind, total_score))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1271 summary_table$monomer_length = length_best
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1272 summary_table$consensus_length = nchar(consensus)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1273 summary_table$score_bn = score_bn
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1274 summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1275 sep = "")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1276 consensus_DS = DNAStringSet(consensus)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1277 names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1278 signif(total_score), "_l_", monomer_length))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1279
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1280 ## filter by score - keep
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1281
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1282 ## reorder by score
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1283 consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1284 summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1285 ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1286 rownames(summary_table) = NULL
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1287 N = nrow(summary_table)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1288 ## concatenate concensus(ie. make dimer head 2 tail) for pbs detection and other
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1289 ## make something like 'pseudo contig' multimer for mapping - min length 200 bp
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1290
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1291 ## searches
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1292 consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1293 tarean_contigs = DNAStringSet(sapply(consensus_DS,function(x)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1294 ifelse(nchar(x)<200,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1295 paste(rep(as.character(x),round(300/nchar(as.character(x))+1)),collapse=''),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1296 as.character(x)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1297 )
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1298
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1299 names(consensus_DS_dimer) = names(consensus_DS)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1300 # save sequences:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1301 consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1302 consensus_DS_file = paste0(output_dir, "/consensus.fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1303 tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1304 writeXStringSet(consensus_DS, consensus_DS_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1305 writeXStringSet(tarean_contigs, tarean_contig_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1306 writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1307 if (is.null(tRNA_database_path)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1308 pbs_score = -1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1309 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1310 pbs_blast_table = paste0(output_dir, "/pbs_detection")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1311 pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1312 reads = input_sequences_oriented, output = pbs_blast_table)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1313 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1314 ## search of open reading frames get the length of the longest
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1315 orf_l = max_ORF_length(consensus_DS_dimer)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1316
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1317
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1318 dir.create(paste0(output_dir, "/img"), recursive = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1319 summary_table$monomer_length_graph = numeric(N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1320 summary_table$graph_image = character(N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1321 summary_table$monomer_length_logo = numeric(N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1322 summary_table$logo_image = character(N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1323 ## export graph nd consensus estimate to cluster directory type of output may
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1324 ## change in future
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1325 save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1326 save(monomers, file = paste0(output_dir, "/monomers.RData"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1327
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1328 cat("generating HTML output\n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1329 for (i in 1:N) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1330 kmer = as.character(summary_table$kmer[i])
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1331 variant = summary_table$variant[i]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1332 ## export graph
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1333 fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1334 fout = paste0(output_dir, "/", fout_link)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1335 ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1336 ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1337 summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1338
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1339 if (i <= 10) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1340 png(fout, width = 800, height = 800)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1341 plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1342 dev.off()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1343 summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1344 table = FALSE, width = 100, height = 100)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1345 ## export logo
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1346 png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1347 fout = paste0(output_dir, "/", png_link)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1348 png(fout, width = 1200, height = round(summary_table$monomer_length[i] *
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1349 1) + 550)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1350 try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1351 dev.off()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1352 ## export corresponding position probability matrices
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1353 ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1354 ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1355 write.table(monomers[[kmer]]$estimates[[variant]]$CM,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1356 file = ppm_file,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1357 col.names = TRUE, quote = FALSE,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1358 row.names = FALSE, sep="\t")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1359 summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1360 table = FALSE, width = 200, height = 100)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1361 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1362
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1363 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1364
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1365 ## html_report = HTMLReport()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1366
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1367 htmlfile = paste0(output_dir, "/report.html")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1368 cat(htmlheader, file = htmlfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1369 included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1370 summary_table_clean = summary_table[,included_columns]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1371 colnames(summary_table_clean) = c('k-mer length',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1372 'Variant index',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1373 'k-mer coverage score',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1374 'Consensus length',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1375 'Consensus sequence',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1376 'k-mer based graph',
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1377 'Sequence logo')
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1378 HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1379 HTMLEndFile(file = htmlfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1380 time4 = Sys.time()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1381 print(time4 - time0)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1382 if (!is.null(lock_file)){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1383 print("------removing-lock--------")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1384 removelock(lock_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1385 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1386
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1387 print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1388 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1389 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1390 orf_l = orf_l, tarean_contig_file = tarean_contig_file))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1391 return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1392 TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1393 TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1394 orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1395 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1396
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1397
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1398 ## graph loop index stability
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1399 loop_index_instability = function(G) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1400 N = 50
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1401 s = seq(vcount(G), vcount(G)/10, length.out = N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1402 p = seq(1, 0.1, length.out = N)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1403 li = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1404 for (i in seq_along(s)) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1405 print(i)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1406 gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1407 li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1408 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1409 instability = lm(li ~ p)$coefficient[2]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1410 return(instability)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1411 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1412
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1413 isSatellite = function(x, y, model) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1414 p = get_prob(x, y, model)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1415 if (p > model$cutoff) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1416 return("Putative Satellite")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1417 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1418 return("")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1419 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1420 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1421
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1422 get_prob = function(x, y, model) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1423 pm = model$prob_matrix
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1424 N = ncol(pm)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1425 i = round(x * (N - 1)) + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1426 j = round(y * (N - 1)) + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1427 p = pm[i, j]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1428 return(p)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1429 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1430
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1431
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1432 detectMemUsage = function() {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1433 con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1434 memInfo = read.table(con, fill = TRUE, row.names = 1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1435 close(con)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1436 memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal",
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1437 1]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1438 return(memUsage)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1439 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1440
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1441
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1442 makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1443 lockdir=dirname(lockfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1444 if(!file.exists(lockdir)){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1445 if(CreateDirectories) dir.create(lockdir,recursive=TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1446 else stop("Lock Directory for lockfile ",lockfile," does not exist")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1447 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1448 if(missing(lockmsg)) lockmsg=paste(system('hostname',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR"))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1449 if (file.exists(lockfile)) return (FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1450 # note the use of paste makes the message writing atomic
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1451 cat(paste(lockmsg,"\n",sep=""),file=lockfile,append=TRUE,sep="")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1452 firstline=readLines(lockfile,n=1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1453 if(firstline!=lockmsg){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1454 # somebody else got there first
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1455 return(FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1456 } else return(TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1457 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1458
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1459
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1460 removelock<-function(lockfile){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1461 if(unlink(lockfile)!=0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1462 warning("Unable to remove ",lockfile)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1463 return (FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1464 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1465 return (TRUE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1466 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1467
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1468
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1469 waitForRAM = function(p = 0.5,lock_file=NULL) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1470 if (detectMemUsage() < p) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1471 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1472 ## check lock file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1473 } else {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1474 cat("waiting for RAM \n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1475 free_count = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1476 while (TRUE) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1477 if (makelock(lock_file)){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1478 print("---------locking--------")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1479 return(lock_file)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1480 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1481 if (detectMemUsage() < p) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1482 cat("RAM freed \n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1483 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1484 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1485 Sys.sleep(5)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1486 if (evaluate_user_cpu_usage() == 'free'){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1487 free_count = free_count + 1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1488 }else{
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1489 free_count = 0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1490 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1491 if (detectMemUsage() < 0.8 & free_count > 100){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1492 cat("RAM not free but nothing else is running \n")
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1493 return(NULL)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1494 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1495 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1496 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1497 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1498
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1499 lsmem = function() {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1500 g = globalenv()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1501 out_all = envs = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1502 envs = append(envs, g)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1503 total_size = numeric()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1504 while (environmentName(g) != "R_EmptyEnv") {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1505 g <- parent.env(g)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1506 envs = append(envs, g)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1507 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1508 for (e in envs) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1509
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1510 obj = ls(envir = e)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1511 if (length(obj) == 0) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1512 break
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1513 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1514 obj.size = list()
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1515 for (i in obj) {
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1516 obj.size[[i]] = object.size(get(i, envir = e))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1517 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1518 out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1519 out = out[order(out$size, decreasing = TRUE), ]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1520 out_all = append(out_all, out)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1521 total_size = append(total_size, sum(out$size))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1522 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1523 return(list(objects = out_all, total_size = total_size))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1524 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1525
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1526 evaluate_user_cpu_usage = function(){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1527 user = Sys.info()["user"]
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1528 a = sum(as.numeric (system(paste ("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1529 s = substring (system(paste ("ps -e -o stat -u", user), intern = TRUE)[-1],1,1)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1530 if (a<5 & sum(s %in% 'D')==0 & sum(s%in% 'R')<2){
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1531 status = 'free'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1532 }else{
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1533 status = 'full'
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1534 }
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1535 return(status)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
1536 }