| 
0
 | 
     1 #!/user/bin/env Rscript
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 suppressPackageStartupMessages(library(igraph))
 | 
| 
 | 
     4 suppressPackageStartupMessages(library(parallel))
 | 
| 
 | 
     5 suppressPackageStartupMessages(library(Biostrings))
 | 
| 
 | 
     6 suppressPackageStartupMessages(library(scales))
 | 
| 
 | 
     7 suppressPackageStartupMessages(library(stringr))
 | 
| 
 | 
     8 suppressPackageStartupMessages(library(hwriter))
 | 
| 
 | 
     9 suppressPackageStartupMessages(library(R2HTML))
 | 
| 
 | 
    10 suppressPackageStartupMessages(library(plyr))
 | 
| 
 | 
    11 suppressPackageStartupMessages(library(dplyr))
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 max_ORF_length = function(s) {
 | 
| 
 | 
    14   ## check all frames
 | 
| 
 | 
    15   L = 0
 | 
| 
 | 
    16   for (i in 1:3) {
 | 
| 
 | 
    17     L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(s, i))), "*", 
 | 
| 
 | 
    18                                      fixed = TRUE))))
 | 
| 
 | 
    19     L = max(L, nchar(unlist(strsplit(as.character(translate(subseq(reverseComplement(s), 
 | 
| 
 | 
    20                                                                    i))), "*", fixed = TRUE))))
 | 
| 
 | 
    21   }
 | 
| 
 | 
    22   return(L)
 | 
| 
 | 
    23 }
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 kmers2graph = function(kmers, mode = "strong", prop = NULL) {
 | 
| 
 | 
    26   kmerLength = nchar(kmers[1, 1])
 | 
| 
 | 
    27   if (ncol(kmers) == 2) {
 | 
| 
 | 
    28     kmers$size = kmers[, 2]/sum(kmers[, 2])
 | 
| 
 | 
    29   }
 | 
| 
 | 
    30   colnames(kmers) = c("name", "count", "size")
 | 
| 
 | 
    31   if (!is.null(prop)) {  # tohle se nepouziva(prop je null), a je to asi spatne - filtuje se to pred tridenim!!
 | 
| 
 | 
    32     p = cumsum(kmers$size)
 | 
| 
 | 
    33     kmers = kmers[p < prop, ]
 | 
| 
 | 
    34   }
 | 
| 
 | 
    35   N = dim(kmers)[1]
 | 
| 
 | 
    36   kmers = kmers[order(kmers$size), ]
 | 
| 
 | 
    37   ## convert kmers to fasta file
 | 
| 
 | 
    38   kms = data.frame(kmer = substring(kmers$name, 1, kmerLength - 1), ids = 1:nrow(kmers),stringsAsFactors = FALSE)
 | 
| 
 | 
    39   kme = data.frame(kmer = substring(kmers$name, 2), ide = 1:nrow(kmers), stringsAsFactors = FALSE)
 | 
| 
 | 
    40 
 | 
| 
 | 
    41   ## df = merge(kms,kme, by = 'kmer',all=FALSE)[,c(2,3,1)]
 | 
| 
 | 
    42   df = inner_join(kme,kms, by = 'kmer')[,c(2,3)]
 | 
| 
 | 
    43 
 | 
| 
 | 
    44   ## names(kms) = seq_along(kms)
 | 
| 
 | 
    45   ## kme = substring(kmers$name, 2)
 | 
| 
 | 
    46   ## names(kme) = seq_along(kme)
 | 
| 
 | 
    47   ## ## use new blast!
 | 
| 
 | 
    48   ## database = tempfile()
 | 
| 
 | 
    49   ## query = tempfile()
 | 
| 
 | 
    50   ## output = tempfile()
 | 
| 
 | 
    51   ## writeXStringSet(DNAStringSet(kms), filepath = database, format = "fasta")
 | 
| 
 | 
    52   ## writeXStringSet(DNAStringSet(kme), filepath = query, format = "fasta")
 | 
| 
 | 
    53   ## cmd = paste("makeblastdb -in", database, "-dbtype nucl")
 | 
| 
 | 
    54   ## system(cmd, ignore.stdout = TRUE)
 | 
| 
 | 
    55   ## cmd = paste("blastn -outfmt '6 qseqid sseqid pident'  -strand plus -dust no -perc_identity 100 -query ", 
 | 
| 
 | 
    56   ##     query, "-db", database, "-word_size", kmerLength - 1, "-out", output)
 | 
| 
 | 
    57   ## system(cmd)
 | 
| 
 | 
    58   ## df = try({
 | 
| 
 | 
    59   ##   read.table(output, as.is = TRUE)
 | 
| 
 | 
    60   ## })
 | 
| 
 | 
    61   ## if (class(df) == "try-error"){
 | 
| 
 | 
    62   ##   print("creation of kmer graph failed")
 | 
| 
 | 
    63   ##   print(query)
 | 
| 
 | 
    64   ##   print(output)
 | 
| 
 | 
    65   ##   print(database)
 | 
| 
 | 
    66   ##   return(NULL)
 | 
| 
 | 
    67   ## }
 | 
| 
 | 
    68   ## unlink(query)
 | 
| 
 | 
    69   ## unlink(paste(database, "*", sep = ""))
 | 
| 
 | 
    70   ## unlist(output)
 | 
| 
 | 
    71   gm_mean = function(x, na.rm = TRUE) {
 | 
| 
 | 
    72     exp(sum(log(x[x > 0]), na.rm = na.rm)/length(x))
 | 
| 
 | 
    73   }
 | 
| 
 | 
    74   
 | 
| 
 | 
    75   whg = apply(cbind(kmers[df[, 1], 2], V2 = kmers[df[, 2], 2]), 1, gm_mean)
 | 
| 
 | 
    76   G = graph.data.frame(data.frame(V1 = kmers$name[df[, 1]], V2 = kmers$name[df[, 
 | 
| 
 | 
    77                                                                                2]], weight = whg), vertices = kmers[, 1:3])
 | 
| 
 | 
    78                                         # separate to connected components:
 | 
| 
 | 
    79   ccs = clusters(G, mode = mode)$membership
 | 
| 
 | 
    80   sel_cls = which(tabulate(ccs) > 1)
 | 
| 
 | 
    81   Gs = list()
 | 
| 
 | 
    82   for (i in seq_along(sel_cls)) {
 | 
| 
 | 
    83     Gs[[i]] = induced.subgraph(G, vids = which(ccs %in% sel_cls[i]))
 | 
| 
 | 
    84   }
 | 
| 
 | 
    85   ## reorder!!!
 | 
| 
 | 
    86   Gs = Gs[order(sapply(Gs, vcount), decreasing = TRUE)]
 | 
| 
 | 
    87   return(Gs)
 | 
| 
 | 
    88 }
 | 
| 
 | 
    89 
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 OGDFlayout = function(G, ncol = NULL, alg = "fmmm", OGDF = getOption("OGDF")) {
 | 
| 
 | 
    92   ## is ogdf binary available?
 | 
| 
 | 
    93   if (is.null(OGDF)) {
 | 
| 
 | 
    94     OGDF = Sys.getenv("OGDF")
 | 
| 
 | 
    95     if ("" == OGDF) {
 | 
| 
 | 
    96       options(warn = -1)
 | 
| 
 | 
    97       OGDF = system("which runOGDFlayout", intern = TRUE)
 | 
| 
 | 
    98       options(warn = 0)
 | 
| 
 | 
    99       if (length(OGDF) == 0) {
 | 
| 
 | 
   100         cat("path to runOGDFlayout not found\n")
 | 
| 
 | 
   101         return(NULL)
 | 
| 
 | 
   102       }
 | 
| 
 | 
   103       
 | 
| 
 | 
   104     }
 | 
| 
 | 
   105   }
 | 
| 
 | 
   106   if (is.null(ncol)) {
 | 
| 
 | 
   107     if (is.null(E(G)$weight)) {
 | 
| 
 | 
   108       el = data.frame(get.edgelist(G, names = TRUE), rep(1, ecount(G)))
 | 
| 
 | 
   109     } else {
 | 
| 
 | 
   110       el = data.frame(get.edgelist(G, names = TRUE), E(G)$weight)
 | 
| 
 | 
   111     }
 | 
| 
 | 
   112     ncol = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", sep = "")
 | 
| 
 | 
   113     write.table(el, file = ncol, row.names = FALSE, col.names = FALSE, sep = "\t", 
 | 
| 
 | 
   114                 quote = FALSE)
 | 
| 
 | 
   115   } else {
 | 
| 
 | 
   116                                         # copy ncol:
 | 
| 
 | 
   117     ncol_tmp = paste(tempfile(pattern = as.character(Sys.getpid())), ".layout", 
 | 
| 
 | 
   118                      sep = "")
 | 
| 
 | 
   119     file.copy(ncol, ncol_tmp)
 | 
| 
 | 
   120     ncol = ncol_tmp
 | 
| 
 | 
   121   }
 | 
| 
 | 
   122   algopt = c("fmmm", "sm", "fme")
 | 
| 
 | 
   123   if (!(alg %in% c("fmmm", "sm", "fme") && TRUE)) {
 | 
| 
 | 
   124     stop("alg must by :", algopt, "\n")
 | 
| 
 | 
   125     
 | 
| 
 | 
   126   }
 | 
| 
 | 
   127   
 | 
| 
 | 
   128                                         # output file:
 | 
| 
 | 
   129   Louts = list()
 | 
| 
 | 
   130   layout_file = tempfile(pattern = as.character(Sys.getpid()))
 | 
| 
 | 
   131   for (i in alg) {
 | 
| 
 | 
   132     cmd = paste(OGDF, "-alg", i, "-iff layout -off layout -out", layout_file, 
 | 
| 
 | 
   133                 ncol)
 | 
| 
 | 
   134     system(cmd, intern = TRUE)
 | 
| 
 | 
   135     L = read.table(layout_file, skip = ecount(G))
 | 
| 
 | 
   136     L = L[match(V(G)$name, L$V2), ]
 | 
| 
 | 
   137     Lout = as.matrix(L[, -(1:2)])
 | 
| 
 | 
   138     unlink(layout_file)
 | 
| 
 | 
   139     Louts[[i]] = Lout
 | 
| 
 | 
   140     
 | 
| 
 | 
   141   }
 | 
| 
 | 
   142                                         # clean up
 | 
| 
 | 
   143   unlink(ncol)
 | 
| 
 | 
   144   return(Louts)
 | 
| 
 | 
   145 }
 | 
| 
 | 
   146 
 | 
| 
 | 
   147 xcolor_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
 | 
| 
 | 
   148 
 | 
| 
 | 
   149 kmers2color = function(s, position = NULL) {
 | 
| 
 | 
   150   if (is.null(position)) {
 | 
| 
 | 
   151     position = round(nchar(s[1])/2, 0)
 | 
| 
 | 
   152     ## position = 1
 | 
| 
 | 
   153     position = nchar(s[1])
 | 
| 
 | 
   154   }
 | 
| 
 | 
   155   color_code = c(A = "#FF0000", C = "#00FF00", T = "#0000FF", G = "#FFFF00")
 | 
| 
 | 
   156   color_base = substring(s, position, position)
 | 
| 
 | 
   157   colors = color_code[color_base]
 | 
| 
 | 
   158   names(colors) = color_base
 | 
| 
 | 
   159   return(colors)
 | 
| 
 | 
   160 }
 | 
| 
 | 
   161 get_sequence = function(g, v, position = NULL) {
 | 
| 
 | 
   162   s = V(g)$name
 | 
| 
 | 
   163   if (is.null(position)) {
 | 
| 
 | 
   164     position = round(nchar(s[1])/2, 0)
 | 
| 
 | 
   165     ## position = 1
 | 
| 
 | 
   166     position = nchar(s[1])
 | 
| 
 | 
   167   }
 | 
| 
 | 
   168   nt = paste(substring(s[v], position, position), collapse = "")
 | 
| 
 | 
   169   return(nt)
 | 
| 
 | 
   170 }
 | 
| 
 | 
   171 
 | 
| 
 | 
   172 
 | 
| 
 | 
   173 get_mimimal_cc = function(km, thr = 20, min_coverage = 0.45, step = 2, start = NULL) {
 | 
| 
 | 
   174   if (is.null(start)) {
 | 
| 
 | 
   175     i = sum(cumsum(km$freq) < 0.5)
 | 
| 
 | 
   176   } else {
 | 
| 
 | 
   177     i = sum(cumsum(km$freq) < start)
 | 
| 
 | 
   178   }
 | 
| 
 | 
   179   continue = TRUE
 | 
| 
 | 
   180   while (continue) {
 | 
| 
 | 
   181     if (i > nrow(km)) {
 | 
| 
 | 
   182       i = nrow(km)
 | 
| 
 | 
   183       continue = FALSE
 | 
| 
 | 
   184       step = 1
 | 
| 
 | 
   185     }
 | 
| 
 | 
   186     GG = kmers2graph(km[1:i, ])
 | 
| 
 | 
   187     if (length(GG) > 0) {
 | 
| 
 | 
   188       if (vcount(GG[[1]]) > thr) {
 | 
| 
 | 
   189         if (sum(V(GG[[1]])$size) >= min_coverage) {
 | 
| 
 | 
   190           GG[[1]]$input_coverage = sum(km$freq[1:i])
 | 
| 
 | 
   191           GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
 | 
| 
 | 
   192           return(GG[[1]])
 | 
| 
 | 
   193         }
 | 
| 
 | 
   194       }
 | 
| 
 | 
   195     }
 | 
| 
 | 
   196     i = round(i * step)
 | 
| 
 | 
   197     
 | 
| 
 | 
   198   }
 | 
| 
 | 
   199   if (length(GG) == 0 | is.null(GG)) {
 | 
| 
 | 
   200     return(NULL)
 | 
| 
 | 
   201   }
 | 
| 
 | 
   202   
 | 
| 
 | 
   203   GG[[1]]$input_coverage = sum(km$freq[1:i])
 | 
| 
 | 
   204   GG[[1]]$L = OGDFlayout(GG[[1]])[[1]]
 | 
| 
 | 
   205   return(GG[[1]])
 | 
| 
 | 
   206 }
 | 
| 
 | 
   207 
 | 
| 
 | 
   208 
 | 
| 
 | 
   209 
 | 
| 
 | 
   210 paths2string = function(paths) {
 | 
| 
 | 
   211   pathstring = sapply(lapply(lapply(paths, as_ids), substring, 1, 1), paste, collapse = "")
 | 
| 
 | 
   212   return(pathstring)
 | 
| 
 | 
   213 }
 | 
| 
 | 
   214 
 | 
| 
 | 
   215 
 | 
| 
 | 
   216 align_paths = function(paths, G) {
 | 
| 
 | 
   217   shift = rep(NA, length(paths))
 | 
| 
 | 
   218   thr = 0  # minimal length
 | 
| 
 | 
   219   tr_paths = list()
 | 
| 
 | 
   220   Seqs = list()
 | 
| 
 | 
   221   centre_node = as.numeric(names(sort(table(unlist(paths)), decreasing = TRUE)))[[1]]
 | 
| 
 | 
   222   
 | 
| 
 | 
   223   for (i in seq_along(paths)) {
 | 
| 
 | 
   224     if (centre_node %in% paths[[i]]) {
 | 
| 
 | 
   225       S = which(paths[[i]] %in% centre_node)
 | 
| 
 | 
   226       shift[i] = S
 | 
| 
 | 
   227       if (S == 1) {
 | 
| 
 | 
   228         tr_paths[[i]] = paths[[i]]
 | 
| 
 | 
   229       } else {
 | 
| 
 | 
   230         tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - 
 | 
| 
 | 
   231                                                                           1)])
 | 
| 
 | 
   232       }
 | 
| 
 | 
   233       Seqs[[i]] = get_sequence(G, tr_paths[[i]])
 | 
| 
 | 
   234     } else {
 | 
| 
 | 
   235       shift[i] = NA
 | 
| 
 | 
   236     }
 | 
| 
 | 
   237   }
 | 
| 
 | 
   238   paths_n = lapply(paths, as.numeric)
 | 
| 
 | 
   239   tr_paths_n = do.call(cbind, lapply(tr_paths, as.numeric))
 | 
| 
 | 
   240   new_shift = shift
 | 
| 
 | 
   241   for (i in which(is.na(shift))) {
 | 
| 
 | 
   242     score = numeric(length(paths_n))
 | 
| 
 | 
   243     for (S in seq_along(paths_n[[i]])) {
 | 
| 
 | 
   244       if (S == 1) {
 | 
| 
 | 
   245         path_tmp_n = paths_n[[i]]
 | 
| 
 | 
   246       } else {
 | 
| 
 | 
   247         path_tmp_n = c(paths_n[[i]][S:length(paths_n[[i]])], paths_n[[i]][1:(S - 
 | 
| 
 | 
   248                                                                              1)])
 | 
| 
 | 
   249       }
 | 
| 
 | 
   250       score[S] = sum(tr_paths_n == path_tmp_n)
 | 
| 
 | 
   251     }
 | 
| 
 | 
   252     if (sum(score) != 0) {
 | 
| 
 | 
   253       S = which.max(score)
 | 
| 
 | 
   254       new_shift[i] = S
 | 
| 
 | 
   255       if (S == 1) {
 | 
| 
 | 
   256         tr_paths[[i]] = paths[[i]]
 | 
| 
 | 
   257       } else {
 | 
| 
 | 
   258         tr_paths[[i]] = c(paths[[i]][S:length(paths[[i]])], paths[[i]][1:(S - 
 | 
| 
 | 
   259                                                                           1)])
 | 
| 
 | 
   260       }
 | 
| 
 | 
   261       Seqs[[i]] = get_sequence(G, tr_paths[[i]])
 | 
| 
 | 
   262     }
 | 
| 
 | 
   263   }
 | 
| 
 | 
   264   shift = new_shift
 | 
| 
 | 
   265                                         # try to shift based on the sequences itself
 | 
| 
 | 
   266   return(list(Seqs = Seqs[!is.na(shift)], tr_paths = tr_paths[!is.na(shift)], shift = shift[!is.na(shift)]))
 | 
| 
 | 
   267 }
 | 
| 
 | 
   268 
 | 
| 
 | 
   269 make_consensus = function(paths_info, G) {
 | 
| 
 | 
   270   include = !is.na(paths_info$shift)
 | 
| 
 | 
   271   ## make alignments using mafft
 | 
| 
 | 
   272   aln = mafft(unlist(paths_info$Seqs[include]))
 | 
| 
 | 
   273   CM = calculate_consensus_matrix(aln = aln, tr_paths = paths_info$tr_paths[include], 
 | 
| 
 | 
   274                                   G = G)
 | 
| 
 | 
   275   gaps = get_gaps_from_alignment(aln)
 | 
| 
 | 
   276   CMnorm = CM/rowSums(CM)
 | 
| 
 | 
   277   bases = colnames(CM)
 | 
| 
 | 
   278   consensus = sapply(apply(CMnorm, 1, function(x) bases[which(x > 0.2)][order(x[x > 
 | 
| 
 | 
   279                                                                                 0.2], decreasing = TRUE)]), paste, collapse = "")
 | 
| 
 | 
   280   consensus2 = gsub("-", "", paste0(substring(consensus, 1, 1), collapse = ""), 
 | 
| 
 | 
   281                     fixed = TRUE)
 | 
| 
 | 
   282   number_of_SNP = sum(rowSums(CM > 0) > 1)
 | 
| 
 | 
   283   SNP_positions = which(rowSums(CM > 0) > 1)
 | 
| 
 | 
   284   number_of_position_with_indels = sum(colSums(do.call(rbind, strsplit(as.character(aln), 
 | 
| 
 | 
   285                                                                        "")) == "-") > 0)
 | 
| 
 | 
   286   indel_positions = which(colSums(do.call(rbind, strsplit(as.character(aln), "")) == 
 | 
| 
 | 
   287                                   "-") > 0)
 | 
| 
 | 
   288   if (length(SNP_positions) > 0) {
 | 
| 
 | 
   289     variable_sites = unlist(c(c(mapply(paste, strsplit((sapply(apply(CMnorm, 
 | 
| 
 | 
   290                                                                      1, function(x) bases[which(x > 0.2)]), paste, collapse = ""))[SNP_positions], 
 | 
| 
 | 
   291                                                        ""), SNP_positions, sep = "_")), paste("-", indel_positions, sep = "_")))
 | 
| 
 | 
   292   } else {
 | 
| 
 | 
   293     variable_sites = NULL
 | 
| 
 | 
   294   }
 | 
| 
 | 
   295   variable_positions = unique(SNP_positions, indel_positions)
 | 
| 
 | 
   296   return(list(aln = aln, CM = CM, CMnorm = CMnorm, consensus = consensus, consensus2 = consensus2, 
 | 
| 
 | 
   297               number_of_SNP = number_of_SNP, SNP_positions = SNP_positions, number_of_position_with_indels = number_of_position_with_indels, 
 | 
| 
 | 
   298               indel_positions = indel_positions, variable_positions = variable_positions, 
 | 
| 
 | 
   299               variable_sites = variable_sites, gaps = gaps))
 | 
| 
 | 
   300 }
 | 
| 
 | 
   301 
 | 
| 
 | 
   302 
 | 
| 
 | 
   303 estimate_monomer = function(G, weights = NULL, limit = NULL) {
 | 
| 
 | 
   304   if (is.null(G)) {
 | 
| 
 | 
   305     return(NULL)
 | 
| 
 | 
   306   }
 | 
| 
 | 
   307   ## estimate monomer from kmer based graph
 | 
| 
 | 
   308   V(G)$id = 1:vcount(G)
 | 
| 
 | 
   309   GS = induced_subgraph(G, vids = which(degree(G) == 2))  ## keep only vertices without branching
 | 
| 
 | 
   310   cls = clusters(GS)$membership
 | 
| 
 | 
   311   
 | 
| 
 | 
   312   
 | 
| 
 | 
   313   ids = mapply(FUN = function(x, y) x[which.max(y)], split(V(GS)$id, cls), split(V(GS)$size, 
 | 
| 
 | 
   314                                                                                  cls))  ## from each branch use only one vertex with larges size!
 | 
| 
 | 
   315   
 | 
| 
 | 
   316   
 | 
| 
 | 
   317   ids = ids[order(V(G)$size[ids], decreasing = TRUE)]
 | 
| 
 | 
   318   ids_size = V(G)$size[ids]
 | 
| 
 | 
   319   N50 = sum(cumsum(ids_size)/sum(ids_size) < 0.5)
 | 
| 
 | 
   320   if (length(ids) > 10000) {
 | 
| 
 | 
   321     ids = ids[1:N50]
 | 
| 
 | 
   322   }
 | 
| 
 | 
   323   ## use only large vertices in search!  how many?
 | 
| 
 | 
   324   el = get.edgelist(G, names = FALSE)
 | 
| 
 | 
   325   node_use = numeric(vcount(G))
 | 
| 
 | 
   326   LL = numeric()
 | 
| 
 | 
   327   ## W=numeric()
 | 
| 
 | 
   328   i = 0
 | 
| 
 | 
   329   paths = list()
 | 
| 
 | 
   330   if (is.null(weights)) {
 | 
| 
 | 
   331     weights = (max(E(G)$weight) - (E(G)$weight) + 1)
 | 
| 
 | 
   332     weights = E(G)$weight^(-3)
 | 
| 
 | 
   333   }
 | 
| 
 | 
   334   included = rep(FALSE, vcount(G))
 | 
| 
 | 
   335   W_total = sum(V(G)$size)
 | 
| 
 | 
   336   
 | 
| 
 | 
   337   coverage = numeric()
 | 
| 
 | 
   338   t0 = c()
 | 
| 
 | 
   339   i = 0
 | 
| 
 | 
   340   j = 0
 | 
| 
 | 
   341   for (n in ids) {
 | 
| 
 | 
   342     j = j + 1
 | 
| 
 | 
   343     t0[j] = Sys.time()
 | 
| 
 | 
   344     if (included[n]) {
 | 
| 
 | 
   345       next
 | 
| 
 | 
   346     }
 | 
| 
 | 
   347     m = which(el[, 1] %in% n)
 | 
| 
 | 
   348     i = i + 1
 | 
| 
 | 
   349     s = get.shortest.paths(G, el[m, 2], el[m, 1], weights = weights, output = "vpath")
 | 
| 
 | 
   350     included[as.numeric(s$vpath[[1]])] = TRUE
 | 
| 
 | 
   351     paths[[i]] = s$vpath[[1]]
 | 
| 
 | 
   352     LL[i] = (length(s$vpath[[1]]))
 | 
| 
 | 
   353   }
 | 
| 
 | 
   354   
 | 
| 
 | 
   355   ## evaluate if paths should be divided to variants - by length and path weight
 | 
| 
 | 
   356   paths_clusters = split(paths, LL)
 | 
| 
 | 
   357   paths_clusters_tr = mclapply(paths_clusters, FUN = align_paths, G = G, mc.cores = getOption("CPU"))
 | 
| 
 | 
   358   
 | 
| 
 | 
   359   ## paths_clusters_tr = lapply(paths_clusters, FUN = align_paths, G = G) consensus
 | 
| 
 | 
   360   paths_consensus = mclapply(paths_clusters_tr, make_consensus, G = G, mc.cores = getOption("CPU"))
 | 
| 
 | 
   361   
 | 
| 
 | 
   362   ## evaluate weight for individual paths:
 | 
| 
 | 
   363   for (v in seq_along(paths_consensus)) {
 | 
| 
 | 
   364     p = paths_clusters_tr[[v]]$tr_paths
 | 
| 
 | 
   365     ## clean
 | 
| 
 | 
   366     p = p[!sapply(p, function(x) anyNA(x) | is.null(x))]
 | 
| 
 | 
   367     L = sapply(p, length)
 | 
| 
 | 
   368     p_groups = split(p, L)
 | 
| 
 | 
   369     w_groups = sapply(p_groups, function(x) sum(V(G)$size[unique(c(sapply(x, 
 | 
| 
 | 
   370                                                                           as.numeric)))]))
 | 
| 
 | 
   371     total_score = sum(V(G)$size[unique(c(unlist(sapply(p, as.numeric))))])
 | 
| 
 | 
   372     LW = data.frame(`Length estimate` = unique(L), weight = w_groups, stringsAsFactors = FALSE)
 | 
| 
 | 
   373     LW = LW[order(LW$weight, decreasing = TRUE), ]
 | 
| 
 | 
   374     rownames(LW) = NULL
 | 
| 
 | 
   375     paths_consensus[[v]]$total_score = total_score
 | 
| 
 | 
   376     paths_consensus[[v]]$length_variant_score = LW
 | 
| 
 | 
   377     
 | 
| 
 | 
   378   }
 | 
| 
 | 
   379   return(list(estimates = paths_consensus, paths = paths_clusters_tr))
 | 
| 
 | 
   380 }
 | 
| 
 | 
   381 
 | 
| 
 | 
   382 
 | 
| 
 | 
   383 
 | 
| 
 | 
   384 detect_pbs = function(dimers_file, tRNA_database_path, reads_file, output) {
 | 
| 
 | 
   385   ## read_file contain oriented reads!
 | 
| 
 | 
   386   min_offset = 10
 | 
| 
 | 
   387   max_end_dist = 2
 | 
| 
 | 
   388   min_aln_length = 30
 | 
| 
 | 
   389   max_pbs_search = 30
 | 
| 
 | 
   390   thr_length = 12
 | 
| 
 | 
   391   end_position = 23
 | 
| 
 | 
   392   insertion_proportion_threshold <- 0.15
 | 
| 
 | 
   393   ## read_file contain oriented reads!  for testing
 | 
| 
 | 
   394   if (FALSE) {
 | 
| 
 | 
   395     library(Biostrings)
 | 
| 
 | 
   396     thr_length = 12
 | 
| 
 | 
   397     end_position = 23
 | 
| 
 | 
   398     dimers_file = "consensus_dimer.fasta"
 | 
| 
 | 
   399     reads_file = "reads_oriented.fas"
 | 
| 
 | 
   400     tRNA_database_path = "/mnt/raid/users/petr/workspace/repex_tarean/databases/tRNA_database2.fasta"
 | 
| 
 | 
   401     
 | 
| 
 | 
   402   }
 | 
| 
 | 
   403   ## find read which are half aligned do reference dimer
 | 
| 
 | 
   404   dimers_seq = readDNAStringSet(dimers_file)
 | 
| 
 | 
   405   cmd = paste("makeblastdb -in", dimers_file, "-dbtype nucl")
 | 
| 
 | 
   406   system(cmd, ignore.stdout = TRUE)
 | 
| 
 | 
   407   output = paste0(reads_file, "blast_out.cvs")
 | 
| 
 | 
   408   columns = c("qseqid", "qstart", "qend", "qlen", "sseqid", "sstart", "send", "sstrand", 
 | 
| 
 | 
   409               "slen", "pident", "length")
 | 
| 
 | 
   410   cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), "' -perc_identity 90 -query ", 
 | 
| 
 | 
   411               reads_file, "-db", dimers_file, "-word_size", 7, "-dust no -num_alignments 99999 -strand plus ", 
 | 
| 
 | 
   412               "-out", output)
 | 
| 
 | 
   413   system(cmd)
 | 
| 
 | 
   414   blastdf = read.table(output, as.is = TRUE, col.names = columns, comment.char = "")
 | 
| 
 | 
   415   unlink(output)
 | 
| 
 | 
   416   blastdf = blastdf[blastdf$length >= min_aln_length, ]
 | 
| 
 | 
   417   
 | 
| 
 | 
   418   ## expand two whole read to see unligned reads parts
 | 
| 
 | 
   419   blastdf_expand = blastdf
 | 
| 
 | 
   420   blastdf_expand$qstart = blastdf$qstart - (blastdf$qstart - 1)
 | 
| 
 | 
   421   blastdf_expand$sstart = blastdf$sstart - (blastdf$qstart - 1)
 | 
| 
 | 
   422   blastdf_expand$qend = blastdf$qend + (blastdf$qlen - blastdf$qend)
 | 
| 
 | 
   423   blastdf_expand$send = blastdf$send + (blastdf$qlen - blastdf$qend)
 | 
| 
 | 
   424   pS = blastdf$sstart
 | 
| 
 | 
   425   pE = blastdf$send
 | 
| 
 | 
   426   pSF = blastdf_expand$sstart
 | 
| 
 | 
   427   pEF = blastdf_expand$send
 | 
| 
 | 
   428   cond1 = pS - pSF >= min_offset & blastdf$qend >= (blastdf$qlen - max_end_dist) & 
 | 
| 
 | 
   429     blastdf$sstart >= max_pbs_search
 | 
| 
 | 
   430   cond2 = pEF - pE >= min_offset & blastdf$qstart <= max_end_dist & blastdf$send <= 
 | 
| 
 | 
   431     blastdf$slen - max_pbs_search
 | 
| 
 | 
   432   
 | 
| 
 | 
   433   ## coverage of alignments: evaluate coverage at site with breaks, it is neccessary
 | 
| 
 | 
   434   ## that there must be at least 20% of interupted alignments at given position to
 | 
| 
 | 
   435   ## be considered for additional search
 | 
| 
 | 
   436   coverage_profiles = subject_coverage(blastdf)
 | 
| 
 | 
   437   
 | 
| 
 | 
   438   
 | 
| 
 | 
   439   ## extract flanking sequences - cca 50nt, it should contain tRNA sequence search
 | 
| 
 | 
   440   ## Left
 | 
| 
 | 
   441   fin = tempfile()
 | 
| 
 | 
   442   fout = tempfile()
 | 
| 
 | 
   443   scoreL = scoreR = 0
 | 
| 
 | 
   444   
 | 
| 
 | 
   445   
 | 
| 
 | 
   446   ## left side unique insertion sites
 | 
| 
 | 
   447   if (any(cond1)) {
 | 
| 
 | 
   448     insertion_sites = ddply(blastdf[cond1, ], .(sseqid, sstart), nrow)
 | 
| 
 | 
   449     ## check coverage
 | 
| 
 | 
   450     insertion_sites$coverage = 0
 | 
| 
 | 
   451     for (i in 1:nrow(insertion_sites)) {
 | 
| 
 | 
   452       insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$sstart[[i]]]
 | 
| 
 | 
   453     }
 | 
| 
 | 
   454     insertion_OK_left <- insertion_sites[with(insertion_sites, V1/coverage > 
 | 
| 
 | 
   455                                                                insertion_proportion_threshold), ]
 | 
| 
 | 
   456     
 | 
| 
 | 
   457     if (nrow(insertion_OK_left) > 0) {
 | 
| 
 | 
   458       s = ifelse(insertion_OK_left[, "sstart"] - max_pbs_search < 1, 1, insertion_OK_left[, 
 | 
| 
 | 
   459                                                                                           "sstart"] - max_pbs_search)
 | 
| 
 | 
   460       Lpart = subseq(dimers_seq[match(insertion_OK_left$sseqid, names(dimers_seq))], 
 | 
| 
 | 
   461                      s, insertion_OK_left$sstart)
 | 
| 
 | 
   462       
 | 
| 
 | 
   463       ## names are CONSENSUSID__READID_position
 | 
| 
 | 
   464       names(Lpart) = paste0(names(Lpart), "__", insertion_OK_left$V1, "__", 
 | 
| 
 | 
   465                             insertion_OK_left$sstart)
 | 
| 
 | 
   466       ## check presence TG dinucleotide
 | 
| 
 | 
   467       TG = vcountPattern("TG", subseq(dimers_seq[match(insertion_OK_left$sseqid, 
 | 
| 
 | 
   468                                                        names(dimers_seq))], insertion_OK_left$sstart - 2, insertion_OK_left$sstart + 
 | 
| 
 | 
   469                                                                                                           2))
 | 
| 
 | 
   470       if (any(TG > 0)) {
 | 
| 
 | 
   471         writeXStringSet(Lpart[TG > 0], filepath = fin)
 | 
| 
 | 
   472         cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), 
 | 
| 
 | 
   473                     "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, 
 | 
| 
 | 
   474                     "-word_size", 7, "-strand plus -dust no -max_target_seqs 10000 -evalue 100", 
 | 
| 
 | 
   475                     "-out", fout)
 | 
| 
 | 
   476         
 | 
| 
 | 
   477         system(cmd, ignore.stdout = TRUE)
 | 
| 
 | 
   478         df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
 | 
| 
 | 
   479         filter1 = df$length >= thr_length
 | 
| 
 | 
   480         filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
 | 
| 
 | 
   481         df_pass = df[filter1 & filter2, , drop = FALSE]
 | 
| 
 | 
   482         df_pass_L = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
 | 
| 
 | 
   483         scoreL = get_score(df_pass_L)
 | 
| 
 | 
   484         write.table(df_pass_L, file = paste0(output, "_L.csv"), row.names = FALSE, 
 | 
| 
 | 
   485                     sep = "\t")
 | 
| 
 | 
   486       }
 | 
| 
 | 
   487     }
 | 
| 
 | 
   488   }
 | 
| 
 | 
   489   if (any(cond2)) {
 | 
| 
 | 
   490     ## search Right
 | 
| 
 | 
   491     insertion_sites = ddply(blastdf[cond2, ], .(sseqid, send, slen), nrow)
 | 
| 
 | 
   492     ## check coverage
 | 
| 
 | 
   493     insertion_sites$coverage = 0
 | 
| 
 | 
   494     for (i in 1:nrow(insertion_sites)) {
 | 
| 
 | 
   495       insertion_sites$coverage[i] = coverage_profiles[[insertion_sites$sseqid[i]]][insertion_sites$send[[i]]]
 | 
| 
 | 
   496     }
 | 
| 
 | 
   497     insertion_OK_right <- insertion_sites[with(insertion_sites, V1/coverage > 
 | 
| 
 | 
   498                                                                 insertion_proportion_threshold), ]
 | 
| 
 | 
   499     
 | 
| 
 | 
   500     if (nrow(insertion_OK_right) > 0) {
 | 
| 
 | 
   501       s = ifelse(insertion_OK_right$send + max_pbs_search > insertion_OK_right$slen, 
 | 
| 
 | 
   502                  insertion_OK_right$slen, insertion_OK_right$send + max_pbs_search)
 | 
| 
 | 
   503       Rpart = subseq(dimers_seq[match(insertion_OK_right$sseqid, names(dimers_seq))], 
 | 
| 
 | 
   504                      insertion_OK_right$send, s)
 | 
| 
 | 
   505       names(Rpart) = paste0(names(Rpart), "__", insertion_OK_right$V1, "__", 
 | 
| 
 | 
   506                             insertion_OK_right$send)
 | 
| 
 | 
   507       
 | 
| 
 | 
   508       ## check presence CA dinucleotide
 | 
| 
 | 
   509       CA = vcountPattern("CA", subseq(dimers_seq[match(insertion_OK_right$sseqid, 
 | 
| 
 | 
   510                                                        names(dimers_seq))], insertion_OK_right$send - 2, insertion_OK_right$send + 
 | 
| 
 | 
   511                                                                                                          2, ))
 | 
| 
 | 
   512       if (any(CA > 0)) {
 | 
| 
 | 
   513         writeXStringSet(Rpart[CA > 0], filepath = fin)
 | 
| 
 | 
   514         cmd = paste("blastn -outfmt '6", paste(columns, collapse = " "), 
 | 
| 
 | 
   515                     "' -perc_identity 83 -query ", fin, "-db", tRNA_database_path, 
 | 
| 
 | 
   516                     "-word_size", 7, "-strand minus -dust no -max_target_seqs 10000 -evalue 100", 
 | 
| 
 | 
   517                     "-out", fout)
 | 
| 
 | 
   518         
 | 
| 
 | 
   519         system(cmd, ignore.stdout = TRUE)
 | 
| 
 | 
   520         df = read.table(fout, as.is = TRUE, col.names = columns, comment.char = "")
 | 
| 
 | 
   521         filter1 = df$length >= thr_length
 | 
| 
 | 
   522         filter2 = ifelse(df$send > df$sstart, df$send, df$sstart) >= end_position
 | 
| 
 | 
   523         df_pass = df[filter1 & filter2, , drop = FALSE]
 | 
| 
 | 
   524         df_pass_R = df_pass[!duplicated(df_pass$qseqid), , drop = FALSE]
 | 
| 
 | 
   525         write.table(df_pass_R, file = paste0(output, "_R.csv"), row.names = FALSE, 
 | 
| 
 | 
   526                     sep = "\t")
 | 
| 
 | 
   527         scoreR = get_score(df_pass_R)
 | 
| 
 | 
   528       }
 | 
| 
 | 
   529     }
 | 
| 
 | 
   530   }
 | 
| 
 | 
   531   unlink(fin)
 | 
| 
 | 
   532   unlink(fout)
 | 
| 
 | 
   533   return(max(scoreL, scoreR))
 | 
| 
 | 
   534 }
 | 
| 
 | 
   535 
 | 
| 
 | 
   536 
 | 
| 
 | 
   537 subject_coverage = function(blastdf) {
 | 
| 
 | 
   538   ## calculate coverage for all blast subjects
 | 
| 
 | 
   539   coverage_profiles <- by(blastdf, INDICES = blastdf$sseqid, FUN = function(x) {
 | 
| 
 | 
   540     as.numeric(coverage(IRanges(start = x$sstart, end = x$send)))
 | 
| 
 | 
   541   })
 | 
| 
 | 
   542   return(coverage_profiles)
 | 
| 
 | 
   543   
 | 
| 
 | 
   544 }
 | 
| 
 | 
   545 
 | 
| 
 | 
   546 get_score = function(x) {
 | 
| 
 | 
   547   ## keep best tRNA
 | 
| 
 | 
   548   if (nrow(x) == 0) {
 | 
| 
 | 
   549     return(0)
 | 
| 
 | 
   550   }
 | 
| 
 | 
   551   xm = data.frame(do.call(rbind, strsplit(x$qseqid, "__")), stringsAsFactors = FALSE)
 | 
| 
 | 
   552   xm$score = as.numeric(sapply(strsplit(xm[, 1], "_"), "[[", 4))
 | 
| 
 | 
   553   xm$AA = gsub("-.+$", "", gsub("^.+__", "", x$sseqid))
 | 
| 
 | 
   554   best_score = max(by(xm$score, INDICES = xm$AA, FUN = sum))
 | 
| 
 | 
   555   return(best_score)
 | 
| 
 | 
   556 }
 | 
| 
 | 
   557 
 | 
| 
 | 
   558 
 | 
| 
 | 
   559 
 | 
| 
 | 
   560 dotter = function(seq1, seq2 = NULL, params = "") {
 | 
| 
 | 
   561   if (is.null(seq2)) {
 | 
| 
 | 
   562     seq2 = seq1
 | 
| 
 | 
   563   }
 | 
| 
 | 
   564   library(Biostrings)
 | 
| 
 | 
   565   if (class(seq1) != "DNAStringSet") {
 | 
| 
 | 
   566     seq1 = BStringSet(seq1)
 | 
| 
 | 
   567   }
 | 
| 
 | 
   568   if (class(seq2) != "DNAStringSet") {
 | 
| 
 | 
   569     seq2 = BStringSet(seq2)
 | 
| 
 | 
   570   }
 | 
| 
 | 
   571   sf1 = tempfile("seq1")
 | 
| 
 | 
   572   writeXStringSet(seq1, file = sf1)
 | 
| 
 | 
   573   sf2 = tempfile("seq2")
 | 
| 
 | 
   574   writeXStringSet(seq2, file = sf2)
 | 
| 
 | 
   575   system(paste("dotter", params, sf1, sf2), wait = FALSE)
 | 
| 
 | 
   576   Sys.sleep(2)
 | 
| 
 | 
   577   unlink(c(sf1, sf2))
 | 
| 
 | 
   578   return(NULL)
 | 
| 
 | 
   579 }
 | 
| 
 | 
   580 
 | 
| 
 | 
   581 
 | 
| 
 | 
   582 
 | 
| 
 | 
   583 dotter2 = function(seq1, seq2 = NULL, params = NULL) {
 | 
| 
 | 
   584   if (is.null(seq2)) {
 | 
| 
 | 
   585     seq2 = seq1
 | 
| 
 | 
   586   }
 | 
| 
 | 
   587   if (is.null(params)) {
 | 
| 
 | 
   588     params = " -windowsize 30 -threshold 45 "
 | 
| 
 | 
   589   }
 | 
| 
 | 
   590   library(Biostrings)
 | 
| 
 | 
   591   if (class(seq1) != "DNAStringSet") {
 | 
| 
 | 
   592     seq1 = DNAStringSet(seq1)
 | 
| 
 | 
   593   }
 | 
| 
 | 
   594   if (class(seq2) != "DNAStringSet") {
 | 
| 
 | 
   595     seq2 = DNAStringSet(seq2)
 | 
| 
 | 
   596   }
 | 
| 
 | 
   597   L1 = nchar(seq1)
 | 
| 
 | 
   598   L2 = nchar(seq2)
 | 
| 
 | 
   599   
 | 
| 
 | 
   600   tmpdat1 = tempfile()
 | 
| 
 | 
   601   
 | 
| 
 | 
   602   dir.create(tmpdat1)
 | 
| 
 | 
   603   tmpdat2 = tempfile()
 | 
| 
 | 
   604   dir.create(tmpdat2)
 | 
| 
 | 
   605   oridir = getwd()
 | 
| 
 | 
   606   
 | 
| 
 | 
   607   
 | 
| 
 | 
   608   seq1_merged = DNAStringSet(paste(seq1, collapse = ""))
 | 
| 
 | 
   609   seq2_merged = DNAStringSet(paste(seq2, collapse = ""))
 | 
| 
 | 
   610   seq2rc_merged = reverseComplement(DNAStringSet(paste(seq2, collapse = "")))
 | 
| 
 | 
   611   
 | 
| 
 | 
   612   
 | 
| 
 | 
   613   sf1 = tempfile("seq1")
 | 
| 
 | 
   614   writeXStringSet(seq1_merged, filepath = sf1)
 | 
| 
 | 
   615   sf2 = tempfile("seq2")
 | 
| 
 | 
   616   sf2rc = tempfile("seq2rc")
 | 
| 
 | 
   617   writeXStringSet(seq2_merged, filepath = sf2)
 | 
| 
 | 
   618   writeXStringSet(seq2rc_merged, filepath = sf2rc)
 | 
| 
 | 
   619   
 | 
| 
 | 
   620   cmd1 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2, params)
 | 
| 
 | 
   621   cmd2 = paste("dotmatcher -graph data -asequence ", sf1, "-bsequence", sf2rc, 
 | 
| 
 | 
   622                params)
 | 
| 
 | 
   623   setwd(tmpdat1)
 | 
| 
 | 
   624   output1 = system(cmd1, intern = TRUE)
 | 
| 
 | 
   625   setwd(tmpdat2)
 | 
| 
 | 
   626   output2 = system(cmd2, intern = TRUE)
 | 
| 
 | 
   627   setwd(oridir)
 | 
| 
 | 
   628   
 | 
| 
 | 
   629   
 | 
| 
 | 
   630   
 | 
| 
 | 
   631   fout1 = strsplit(tail(output1, n = 1), split = " ")[[1]][2]
 | 
| 
 | 
   632   rawdat1 = readLines(paste(tmpdat1, "/", fout1, sep = ""))
 | 
| 
 | 
   633   rawdat1 = rawdat1[1:min(grep("^Rectangle", rawdat1))]
 | 
| 
 | 
   634   
 | 
| 
 | 
   635   if (length(rawdat1[grep("^Line", rawdat1)]) == 0) {
 | 
| 
 | 
   636     coord1 = NULL
 | 
| 
 | 
   637   } else {
 | 
| 
 | 
   638     coord1 = apply(sapply(strsplit(rawdat1[grep("^Line", rawdat1)], " "), "[", 
 | 
| 
 | 
   639                           c(3, 5, 7, 9, 11)), 1, as.numeric)
 | 
| 
 | 
   640     coord1 = matrix(coord1, ncol = 5)
 | 
| 
 | 
   641   }
 | 
| 
 | 
   642   
 | 
| 
 | 
   643   fout2 = strsplit(tail(output2, n = 1), split = " ")[[1]][2]
 | 
| 
 | 
   644   rawdat2 = readLines(paste(tmpdat2, "/", fout2, sep = ""))
 | 
| 
 | 
   645   rawdat2 = rawdat2[1:min(grep("^Rectangle", rawdat2))]
 | 
| 
 | 
   646   
 | 
| 
 | 
   647   if (length(rawdat2[grep("^Line", rawdat2)]) == 0) {
 | 
| 
 | 
   648     coord2 = NULL
 | 
| 
 | 
   649   } else {
 | 
| 
 | 
   650     coord2 = apply(sapply(strsplit(rawdat2[grep("^Line", rawdat2)], " "), "[", 
 | 
| 
 | 
   651                           c(3, 5, 7, 9, 11)), 1, as.numeric)
 | 
| 
 | 
   652     coord2 = matrix(coord2, ncol = 5)
 | 
| 
 | 
   653   }
 | 
| 
 | 
   654   unlink(sf1)
 | 
| 
 | 
   655   unlink(sf2)
 | 
| 
 | 
   656   unlink(sf2rc)
 | 
| 
 | 
   657   unlink(tmpdat1, recursive = TRUE)
 | 
| 
 | 
   658   unlink(tmpdat2, recursive = TRUE)
 | 
| 
 | 
   659   
 | 
| 
 | 
   660   N1 = sum(nchar(seq1))
 | 
| 
 | 
   661   N2 = sum(nchar(seq2))
 | 
| 
 | 
   662   op = par(xaxs = "i", yaxs = "i", mar = c(5, 2, 6, 10), las = 1)
 | 
| 
 | 
   663   on.exit(par(op))
 | 
| 
 | 
   664   plot(c(1, N1), c(1, N2), type = "n", xlab = "", ylab = "", axes = FALSE)
 | 
| 
 | 
   665   if (!is.null(coord1)) {
 | 
| 
 | 
   666     segments(x0 = coord1[, 1], y0 = coord1[, 2], x1 = coord1[, 3], y1 = coord1[, 
 | 
| 
 | 
   667                                                                                4])
 | 
| 
 | 
   668   }
 | 
| 
 | 
   669   if (!is.null(coord2)) {
 | 
| 
 | 
   670     segments(x0 = coord2[, 1], y0 = N2 - coord2[, 2], x1 = coord2[, 3], y1 = N2 - 
 | 
| 
 | 
   671                                                                           coord2[, 4])
 | 
| 
 | 
   672   }
 | 
| 
 | 
   673   abline(v = c(0, cumsum(L1)), col = "green")
 | 
| 
 | 
   674   abline(h = c(0, cumsum(L2)), col = "green")
 | 
| 
 | 
   675   box()
 | 
| 
 | 
   676   axis(1, at = c(1, cumsum(L1))[-(length(L1) + 1)], labels = names(seq1), hadj = 0, 
 | 
| 
 | 
   677        cex.axis = 0.7)
 | 
| 
 | 
   678   axis(4, at = c(1, cumsum(L2))[-(length(L2) + 1)], labels = names(seq2), hadj = 0, 
 | 
| 
 | 
   679        cex.axis = 0.7)
 | 
| 
 | 
   680   invisible(list(coord1, coord2))
 | 
| 
 | 
   681 }
 | 
| 
 | 
   682 
 | 
| 
 | 
   683 
 | 
| 
 | 
   684 CM2sequence = function(x) {
 | 
| 
 | 
   685   bases = c(colnames(x), "N")
 | 
| 
 | 
   686   x = cbind(x, 0)
 | 
| 
 | 
   687   colnames(x) = bases
 | 
| 
 | 
   688   seqs = paste(bases[apply(x, 1, which.max)], collapse = "")
 | 
| 
 | 
   689   
 | 
| 
 | 
   690   return(seqs)
 | 
| 
 | 
   691 }
 | 
| 
 | 
   692 
 | 
| 
 | 
   693 ## in alingment calculate significance from kmers
 | 
| 
 | 
   694 calculate_consensus_matrix = function(aln, tr_paths, G) {
 | 
| 
 | 
   695   bases = c("A", "C", "G", "T", "-")
 | 
| 
 | 
   696   positions = lapply(strsplit(as.character(aln), split = ""), function(x) which(!x %in% 
 | 
| 
 | 
   697                                                                                 "-"))
 | 
| 
 | 
   698   base_matrix = do.call(rbind, strsplit(as.character(aln), split = ""))
 | 
| 
 | 
   699   weights = matrix(0, nrow = length(aln), ncol = nchar(aln))
 | 
| 
 | 
   700   kmer_rel_proportions = (V(G)$size/table(factor(unlist(tr_paths), levels = 1:vcount(G))))
 | 
| 
 | 
   701   for (i in seq_along(positions)) {
 | 
| 
 | 
   702     weights[i, positions[[i]]] = kmer_rel_proportions[tr_paths[[i]]]
 | 
| 
 | 
   703   }
 | 
| 
 | 
   704   names(kmer_rel_proportions) = V(G)$name
 | 
| 
 | 
   705   ## get weights for gaps by approximation
 | 
| 
 | 
   706   fitgaps = function(y) {
 | 
| 
 | 
   707     if (sum(y == 0) == 0) {
 | 
| 
 | 
   708       return(y)
 | 
| 
 | 
   709     } else {
 | 
| 
 | 
   710       y0 = rep(y[y != 0], 3)
 | 
| 
 | 
   711       x0 = which(rep(y, 3) != 0)
 | 
| 
 | 
   712       fitted = approx(x0, y0, xout = seq_along(rep(y, 3)), rule = 1)
 | 
| 
 | 
   713     }
 | 
| 
 | 
   714     return(fitted$y[-seq_along(y)][seq_along(y)])
 | 
| 
 | 
   715   }
 | 
| 
 | 
   716   weights_with_gaps = t(apply(weights, 1, fitgaps))
 | 
| 
 | 
   717   ## weights_with_gaps = get_gap_weights(weights, aln,G) ## this step take wey too
 | 
| 
 | 
   718   ## long time!!!-not so important TODO handle gaps differently - more effectively
 | 
| 
 | 
   719   
 | 
| 
 | 
   720   ## get consensus matrix
 | 
| 
 | 
   721   CM = sapply(1:nchar(aln[[1]]), function(i) {
 | 
| 
 | 
   722     sapply(split(weights_with_gaps[, i], factor(base_matrix[, i], levels = bases)), 
 | 
| 
 | 
   723            sum)
 | 
| 
 | 
   724   })
 | 
| 
 | 
   725   return(t(CM)[, 1:5])
 | 
| 
 | 
   726 }
 | 
| 
 | 
   727 
 | 
| 
 | 
   728 
 | 
| 
 | 
   729 get_gaps_from_alignment = function(aln) {
 | 
| 
 | 
   730   as.character(aln)
 | 
| 
 | 
   731   gaps_positions = unique(do.call(rbind, str_locate_all(as.character(aln), "-+")))
 | 
| 
 | 
   732   return(gaps_positions)
 | 
| 
 | 
   733 }
 | 
| 
 | 
   734 
 | 
| 
 | 
   735 plot_kmer_graph = function(G, L = NULL, vertex.size = NULL, ord = NULL, upto = NULL, 
 | 
| 
 | 
   736                            highlight = NULL) {
 | 
| 
 | 
   737   if (!is.null(G$L) & is.null(L)) {
 | 
| 
 | 
   738     L = G$L
 | 
| 
 | 
   739   }
 | 
| 
 | 
   740   if (is.null(L)) {
 | 
| 
 | 
   741     ## L=layout.kamada.kawai(G)
 | 
| 
 | 
   742     L = OGDFlayout(G)[[1]]
 | 
| 
 | 
   743   }
 | 
| 
 | 
   744   clr = kmers2color(V(G)$name)
 | 
| 
 | 
   745   if (!is.null(highlight)) {
 | 
| 
 | 
   746     clr[highlight] = "#00000080"
 | 
| 
 | 
   747   }
 | 
| 
 | 
   748   if (!is.null(ord)) {
 | 
| 
 | 
   749     clr[ord[1:upto]] = paste(clr[ord[1:upto]], "30", sep = "")
 | 
| 
 | 
   750   }
 | 
| 
 | 
   751   if (is.null(vertex.size)) {
 | 
| 
 | 
   752     vertex.size = rescale(V(G)$size, to = c(0.5, 6))
 | 
| 
 | 
   753     
 | 
| 
 | 
   754   }
 | 
| 
 | 
   755   plot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, 
 | 
| 
 | 
   756        vertex.color = clr, vertex.frame.color = "#00000020", edge.width = 2, edge.arrow.mode = 1, 
 | 
| 
 | 
   757        edge.arrow.size = 0.2)
 | 
| 
 | 
   758   
 | 
| 
 | 
   759 }
 | 
| 
 | 
   760 
 | 
| 
 | 
   761 rglplot_kmer_graph = function(G, L = NULL, vertex.size = 4) {
 | 
| 
 | 
   762   if (is.null(L)) {
 | 
| 
 | 
   763     L = layout.kamada.kawai(G)
 | 
| 
 | 
   764   }
 | 
| 
 | 
   765   
 | 
| 
 | 
   766   rglplot(G, layout = L, vertex.label = "", vertex.size = vertex.size, edge.curved = FALSE, 
 | 
| 
 | 
   767           vertex.color = kmers2color(V(G)$name), edge.width = 2, edge.arrow.mode = 1, 
 | 
| 
 | 
   768           edge.arrow.size = 0.5)
 | 
| 
 | 
   769 }
 | 
| 
 | 
   770 
 | 
| 
 | 
   771 
 | 
| 
 | 
   772 
 | 
| 
 | 
   773 
 | 
| 
 | 
   774 mafft = function(seqs, params = "--auto --thread 1 ") {
 | 
| 
 | 
   775   if (length(seqs) < 2) {
 | 
| 
 | 
   776     return(seqs)
 | 
| 
 | 
   777   }
 | 
| 
 | 
   778   infile = tempfile()
 | 
| 
 | 
   779   if (class(seqs) == "character") {
 | 
| 
 | 
   780     seqs = DNAStringSet(seqs)
 | 
| 
 | 
   781   }
 | 
| 
 | 
   782   writeXStringSet(seqs, file = infile)
 | 
| 
 | 
   783   outfile = tempfile()
 | 
| 
 | 
   784   cmd = paste("mafft --quiet --nuc ", params, infile, "2> /dev/null > ", outfile)
 | 
| 
 | 
   785   system(cmd, intern = TRUE, ignore.stderr = FALSE)
 | 
| 
 | 
   786   aln = readDNAStringSet(outfile)
 | 
| 
 | 
   787   unlink(c(outfile, infile))
 | 
| 
 | 
   788   return(aln)
 | 
| 
 | 
   789 }
 | 
| 
 | 
   790 
 | 
| 
 | 
   791 mgblast = function(databaseSeq, querySeq) {
 | 
| 
 | 
   792   params = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -C 30 -D 30 "
 | 
| 
 | 
   793                                         # no dust filtering:
 | 
| 
 | 
   794   paramsDF = " -p 85 -W18 -UT -X40 -KT -JF -F \"m D\" -v100000000 -b100000000 -D4 -F F"
 | 
| 
 | 
   795   
 | 
| 
 | 
   796   database = tempfile()
 | 
| 
 | 
   797   query = tempfile()
 | 
| 
 | 
   798   output = tempfile()
 | 
| 
 | 
   799   if (class(databaseSeq) == "character") {
 | 
| 
 | 
   800     database = databaseSeq
 | 
| 
 | 
   801     do_not_delete_database = TRUE
 | 
| 
 | 
   802   } else {
 | 
| 
 | 
   803     writeXStringSet(databaseSeq, filepath = database, format = "fasta")
 | 
| 
 | 
   804     do_not_delete_database = FALSE
 | 
| 
 | 
   805   }
 | 
| 
 | 
   806   if (class(querySeq) == "character") {
 | 
| 
 | 
   807     query = querySeq
 | 
| 
 | 
   808     do_not_delete_query = TRUE
 | 
| 
 | 
   809   } else {
 | 
| 
 | 
   810     writeXStringSet(querySeq, filepath = query, format = "fasta")
 | 
| 
 | 
   811     do_not_delete_query = FALSE
 | 
| 
 | 
   812   }
 | 
| 
 | 
   813   ## create database:
 | 
| 
 | 
   814   cmd = paste("formatdb -i", database, "-p F")
 | 
| 
 | 
   815   system(cmd)
 | 
| 
 | 
   816   cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, params)
 | 
| 
 | 
   817   system(cmd)
 | 
| 
 | 
   818   if (file.info(output)$size == 0) {
 | 
| 
 | 
   819                                         # no hist, try wthou dust masker
 | 
| 
 | 
   820     cmd = paste("mgblast", "-d", database, "-i", query, "-o", output, paramsDF)
 | 
| 
 | 
   821     system(cmd)
 | 
| 
 | 
   822   }
 | 
| 
 | 
   823   blastOut = read.table(output, sep = "\t", header = FALSE, as.is = TRUE, comment.char = "")
 | 
| 
 | 
   824   unlink(output)
 | 
| 
 | 
   825   if (!do_not_delete_query) {
 | 
| 
 | 
   826     unlink(query)
 | 
| 
 | 
   827   }
 | 
| 
 | 
   828   if (!do_not_delete_database) {
 | 
| 
 | 
   829     unlink(paste(database, "*", sep = ""))
 | 
| 
 | 
   830   }
 | 
| 
 | 
   831   colnames(blastOut) = c("query", "q.length", "q.start", "q.end", "subject", "s.length", 
 | 
| 
 | 
   832                          "s.start", "s.end", "pid", "weight", "e.value", "strand")
 | 
| 
 | 
   833   blastOut
 | 
| 
 | 
   834 }
 | 
| 
 | 
   835 
 | 
| 
 | 
   836 
 | 
| 
 | 
   837 estimate_sample_size = function(NV, NE, maxv, maxe) {
 | 
| 
 | 
   838   ## density
 | 
| 
 | 
   839   d = (2 * NE)/(NV * (NV - 1))
 | 
| 
 | 
   840   eEst = (maxv * (maxv - 1) * d)/2
 | 
| 
 | 
   841   nEst = (d + sqrt(d^2 + 8 * d * maxe))/(2 * d)
 | 
| 
 | 
   842   if (eEst >= maxe) {
 | 
| 
 | 
   843     N = round(nEst)
 | 
| 
 | 
   844     E = round((N * (N - 1) * d)/2)
 | 
| 
 | 
   845     
 | 
| 
 | 
   846   }
 | 
| 
 | 
   847   if (nEst >= maxv) {
 | 
| 
 | 
   848     N = maxv
 | 
| 
 | 
   849     E = round((N * (N - 1) * d)/2)
 | 
| 
 | 
   850     
 | 
| 
 | 
   851   }
 | 
| 
 | 
   852   return(N)
 | 
| 
 | 
   853 }
 | 
| 
 | 
   854 
 | 
| 
 | 
   855 
 | 
| 
 | 
   856 
 | 
| 
 | 
   857 
 | 
| 
 | 
   858 mgblast2graph = function(blastfile, seqfile, graph_destination, directed_graph_destination, 
 | 
| 
 | 
   859                          oriented_sequences, paired = TRUE, repex = FALSE, image_file = NULL, image_file_tmb = NULL, 
 | 
| 
 | 
   860                          include_layout = TRUE, pair_completeness = NULL, satellite_model_path = NULL, 
 | 
| 
 | 
   861                          maxv = 40000, maxe = 5e+08, seqfile_full = seqfile) {
 | 
| 
 | 
   862   cat("loading blast results\n")
 | 
| 
 | 
   863   if (repex) {
 | 
| 
 | 
   864     cln = c("query", "subject", "weight", "q.length", "q.start", "q.end", "s.length", "s.start", 
 | 
| 
 | 
   865             "s.end", "sign")
 | 
| 
 | 
   866     colcls = c("character", "character", "numeric", "numeric", "numeric", "numeric", 
 | 
| 
 | 
   867                "numeric", "numeric", "numeric", "NULL", "NULL", "character")
 | 
| 
 | 
   868     
 | 
| 
 | 
   869   } else {
 | 
| 
 | 
   870     cln = c("query", "q.length", "q.start", "q.end", "subject", "s.length", "s.start", 
 | 
| 
 | 
   871             "s.end","weight", "sign")
 | 
| 
 | 
   872     colcls = c("character", "numeric", "numeric", "numeric", "character", "numeric", 
 | 
| 
 | 
   873                "numeric", "numeric", "NULL", "numeric", "NULL", "character")
 | 
| 
 | 
   874   }
 | 
| 
 | 
   875   if (class(blastfile) == "data.frame") {
 | 
| 
 | 
   876     blastTable = blastfile
 | 
| 
 | 
   877     colnames(blastTable)[12] = "sign"
 | 
| 
 | 
   878   } else {
 | 
| 
 | 
   879     blastTable = read.table(blastfile, sep = "\t", as.is = TRUE, header = FALSE, 
 | 
| 
 | 
   880                             colClasses = colcls, comment.char = "")
 | 
| 
 | 
   881     colnames(blastTable) = cln
 | 
| 
 | 
   882   }
 | 
| 
 | 
   883   ## check for duplicates!
 | 
| 
 | 
   884   key = with(blastTable, ifelse(query > subject, paste(query, subject), paste(subject, 
 | 
| 
 | 
   885                                                                               query)))
 | 
| 
 | 
   886   if (any(duplicated(key))) {
 | 
| 
 | 
   887     blastTable = blastTable[!duplicated(key), ]
 | 
| 
 | 
   888   }
 | 
| 
 | 
   889   seqs = readDNAStringSet(seqfile)
 | 
| 
 | 
   890   ## calculate pair completeness for reads before sampling
 | 
| 
 | 
   891   if (is.null(pair_completeness)) {
 | 
| 
 | 
   892     if (paired) {
 | 
| 
 | 
   893       if (seqfile_full != seqfile) {
 | 
| 
 | 
   894         seqs_full = readDNAStringSet(seqfile_full)
 | 
| 
 | 
   895         pair_counts = tabulate(table(gsub(".$", "", names(seqs_full))))
 | 
| 
 | 
   896         rm(seqs_full)
 | 
| 
 | 
   897       } else {
 | 
| 
 | 
   898         pair_counts = tabulate(table(gsub(".$", "", names(seqs))))
 | 
| 
 | 
   899         
 | 
| 
 | 
   900       }
 | 
| 
 | 
   901       pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
 | 
| 
 | 
   902     }else{
 | 
| 
 | 
   903       pair_completeness = 0
 | 
| 
 | 
   904     }
 | 
| 
 | 
   905   }
 | 
| 
 | 
   906   NV = length(seqs)  # vertices
 | 
| 
 | 
   907   NE = nrow(blastTable)  # nodes
 | 
| 
 | 
   908   if (maxv < NV | maxe < NE) {
 | 
| 
 | 
   909     ## Sample if graph is large
 | 
| 
 | 
   910     V_sample_size = estimate_sample_size(NV, NE, maxv, maxe)
 | 
| 
 | 
   911     seqs = sample(seqs, V_sample_size)
 | 
| 
 | 
   912     blastTable = blastTable[blastTable$query %in% names(seqs) & blastTable$subject %in% 
 | 
| 
 | 
   913                             names(seqs), ]
 | 
| 
 | 
   914   }
 | 
| 
 | 
   915   
 | 
| 
 | 
   916   blastTable$sign = ifelse(blastTable$sign == "+", 1, -1)
 | 
| 
 | 
   917   vnames = unique(c(blastTable$query, blastTable$subject))
 | 
| 
 | 
   918   vindex = seq_along(vnames)
 | 
| 
 | 
   919   names(vindex) = vnames
 | 
| 
 | 
   920   ## correct orientation
 | 
| 
 | 
   921   cat("creating graph\n")
 | 
| 
 | 
   922   G = graph.data.frame(blastTable[, c("query", "subject", "weight", "sign")], directed = FALSE, 
 | 
| 
 | 
   923                        vertices = vnames)
 | 
| 
 | 
   924   if (include_layout) {
 | 
| 
 | 
   925     ## save temporarily modified blastTable if large
 | 
| 
 | 
   926     tmpB = tempfile()
 | 
| 
 | 
   927     save(blastTable, file = tmpB)
 | 
| 
 | 
   928     rm(blastTable)
 | 
| 
 | 
   929     ## 
 | 
| 
 | 
   930     cat("graph layout calculation ")
 | 
| 
 | 
   931     if (ecount(G) > 2e+06) {
 | 
| 
 | 
   932       cat("using fruchterman reingold\n")
 | 
| 
 | 
   933       L = layout.fruchterman.reingold(G, dim = 3)
 | 
| 
 | 
   934     } else {
 | 
| 
 | 
   935       cat("using OGDF & frucherman reingold\n")
 | 
| 
 | 
   936       Ltmp = OGDFlayout(G, alg = c("fmmm"))
 | 
| 
 | 
   937       L = cbind(Ltmp[[1]][, 1:2], layout.fruchterman.reingold(G, dim = 2))
 | 
| 
 | 
   938       
 | 
| 
 | 
   939     }
 | 
| 
 | 
   940     
 | 
| 
 | 
   941     GL = list(G = G, L = L)
 | 
| 
 | 
   942     save(GL, file = graph_destination)
 | 
| 
 | 
   943     if (!is.null(image_file)) {
 | 
| 
 | 
   944       cat("exporting graph figure")
 | 
| 
 | 
   945       png(image_file, width = 900, height = 900, pointsize = 20)
 | 
| 
 | 
   946       plot(GL$G, layout = GL$L[, 1:2], vertex.size = 2, vertex.color = "#000000A0", 
 | 
| 
 | 
   947            edge.color = "#00000040", vertex.shape = "circle", edge.curved = FALSE, 
 | 
| 
 | 
   948            vertex.label = NA, vertex.frame.color = NA, edge.width = 1)
 | 
| 
 | 
   949       dev.off()
 | 
| 
 | 
   950       ## create thunmbs:
 | 
| 
 | 
   951       system(paste("convert ", image_file, " -resize 100x100 ", image_file_tmb))
 | 
| 
 | 
   952       
 | 
| 
 | 
   953     }
 | 
| 
 | 
   954     rm(GL)
 | 
| 
 | 
   955     gc()
 | 
| 
 | 
   956     load(tmpB)
 | 
| 
 | 
   957     unlink(tmpB)
 | 
| 
 | 
   958   }
 | 
| 
 | 
   959   
 | 
| 
 | 
   960   if (!is.connected(G)) {
 | 
| 
 | 
   961     cat("Warning - graph is not connected\n")
 | 
| 
 | 
   962     cc = clusters(G, "weak")
 | 
| 
 | 
   963     mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)), 
 | 
| 
 | 
   964                                                                           decreasing = TRUE)))))
 | 
| 
 | 
   965     selvi = which(mb == 1)  # largest component
 | 
| 
 | 
   966     cat("using largest component: \nsize =", round(length(selvi)/vcount(G) * 
 | 
| 
 | 
   967                                                    100, 1), "% ;", length(selvi), "reads", "\n")
 | 
| 
 | 
   968     G = induced.subgraph(G, vids = selvi)
 | 
| 
 | 
   969     blastTable = blastTable[blastTable$query %in% V(G)$name & blastTable$subject %in% 
 | 
| 
 | 
   970                                                       V(G)$name, ]
 | 
| 
 | 
   971   }
 | 
| 
 | 
   972   
 | 
| 
 | 
   973   Gmst = minimum.spanning.tree(G)
 | 
| 
 | 
   974                                         # create alternative trees - for case that unly suboptima solution is found
 | 
| 
 | 
   975   set.seed(123)
 | 
| 
 | 
   976   Gmst_alt = list()
 | 
| 
 | 
   977   Gmst_alt[[1]] = Gmst
 | 
| 
 | 
   978   for (i in 2:6){
 | 
| 
 | 
   979     E(G)$weight = runif(ecount(G), 0.1,1)
 | 
| 
 | 
   980     Gmst_alt[[i]] = minimum.spanning.tree(G)
 | 
| 
 | 
   981   }
 | 
| 
 | 
   982 
 | 
| 
 | 
   983   rm(G)
 | 
| 
 | 
   984   gc()
 | 
| 
 | 
   985   ## six attempts to reorient reads
 | 
| 
 | 
   986   flip_names_all = list()
 | 
| 
 | 
   987   prop_of_notfit=numeric()
 | 
| 
 | 
   988   thr_fit=0.001
 | 
| 
 | 
   989   for (ii in 1:6){
 | 
| 
 | 
   990     Gmst = Gmst_alt[[ii]]
 | 
| 
 | 
   991 
 | 
| 
 | 
   992 
 | 
| 
 | 
   993     blastTable_mst = as.data.frame(get.edgelist(Gmst, names = TRUE))
 | 
| 
 | 
   994     colnames(blastTable_mst) = c("query", "subject")
 | 
| 
 | 
   995     blastTable_mst$sign = E(Gmst)$sign
 | 
| 
 | 
   996 
 | 
| 
 | 
   997     d = dfs(Gmst, root = 1, father = TRUE, order.out = TRUE)
 | 
| 
 | 
   998     esign = E(Gmst)$sign
 | 
| 
 | 
   999     rc = rep(FALSE, vcount(Gmst))
 | 
| 
 | 
  1000     j = 0
 | 
| 
 | 
  1001     p = 0
 | 
| 
 | 
  1002     for (v in d$order[-1]) {
 | 
| 
 | 
  1003       j = j + 1
 | 
| 
 | 
  1004       f = as.numeric(d$father)[v]
 | 
| 
 | 
  1005       if (is.na(f)) {
 | 
| 
 | 
  1006         next
 | 
| 
 | 
  1007       }
 | 
| 
 | 
  1008       eid = get.edge.ids(Gmst, c(v, f))
 | 
| 
 | 
  1009       if (esign[eid] == -1) {
 | 
| 
 | 
  1010         ie = unique(c(which(blastTable_mst$query %in% V(Gmst)$name[v]), which(blastTable_mst$subject %in% 
 | 
| 
 | 
  1011                                                                               V(Gmst)$name[v])))  # incident edges
 | 
| 
 | 
  1012         esign[ie] = esign[ie] * -1
 | 
| 
 | 
  1013         rc[v] = TRUE
 | 
| 
 | 
  1014         p = p + 1
 | 
| 
 | 
  1015         if (p > 50) {
 | 
| 
 | 
  1016           p = 0
 | 
| 
 | 
  1017           cat("\r")
 | 
| 
 | 
  1018           cat(j, " : ", sum(esign)/length(esign))
 | 
| 
 | 
  1019         }
 | 
| 
 | 
  1020       }
 | 
| 
 | 
  1021     }
 | 
| 
 | 
  1022     cat("\t")
 | 
| 
 | 
  1023     flip_names_all[[ii]] = flip_names = V(Gmst)$name[rc]
 | 
| 
 | 
  1024 
 | 
| 
 | 
  1025     if (!exists("blastTable")) {
 | 
| 
 | 
  1026       load(tmpB)
 | 
| 
 | 
  1027       unlink(tmpB)
 | 
| 
 | 
  1028     }
 | 
| 
 | 
  1029     ## add nofit later!!
 | 
| 
 | 
  1030     s.mean = with(blastTable, (s.start + s.end)/2)
 | 
| 
 | 
  1031     s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length - 
 | 
| 
 | 
  1032                                                                   s.mean, s.mean)
 | 
| 
 | 
  1033     q.mean = with(blastTable, (q.start + q.end)/2)
 | 
| 
 | 
  1034     q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length - 
 | 
| 
 | 
  1035                                                                 q.mean, q.mean)
 | 
| 
 | 
  1036     V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
 | 
| 
 | 
  1037     V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
 | 
| 
 | 
  1038     sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1, 
 | 
| 
 | 
  1039                                                             1) * blastTable$sign
 | 
| 
 | 
  1040     nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
 | 
| 
 | 
  1041     prop_of_notfit[ii] = length(nofit)/vcount(Gmst)
 | 
| 
 | 
  1042     ## check if correctly oriented
 | 
| 
 | 
  1043     cat("prop notfit", prop_of_notfit[[ii]],"\n")
 | 
| 
 | 
  1044     if (prop_of_notfit[[ii]]<thr_fit){
 | 
| 
 | 
  1045       ## OK
 | 
| 
 | 
  1046       break
 | 
| 
 | 
  1047     }
 | 
| 
 | 
  1048   }
 | 
| 
 | 
  1049   if (!prop_of_notfit[[ii]]<thr_fit){
 | 
| 
 | 
  1050     ## get best solution
 | 
| 
 | 
  1051     ii_best = which.min(prop_of_notfit)
 | 
| 
 | 
  1052     if (ii != ii_best){   # if the last solution was not best, get the best
 | 
| 
 | 
  1053       flip_names = flip_names_all[[ii_best]]
 | 
| 
 | 
  1054       s.mean = with(blastTable, (s.start + s.end)/2)
 | 
| 
 | 
  1055       s.mean_corrected = ifelse(blastTable$subject %in% flip_names, blastTable$s.length - 
 | 
| 
 | 
  1056                                                                     s.mean, s.mean)
 | 
| 
 | 
  1057       q.mean = with(blastTable, (q.start + q.end)/2)
 | 
| 
 | 
  1058       q.mean_corrected = ifelse(blastTable$query %in% flip_names, blastTable$q.length - 
 | 
| 
 | 
  1059                                                                   q.mean, q.mean)
 | 
| 
 | 
  1060       V1 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$subject, blastTable$query)
 | 
| 
 | 
  1061       V2 = ifelse(s.mean_corrected > q.mean_corrected, blastTable$query, blastTable$subject)
 | 
| 
 | 
  1062       sign_final = ifelse(V1 %in% flip_names, -1, 1) * ifelse(V2 %in% flip_names, -1, 
 | 
| 
 | 
  1063                                                               1) * blastTable$sign
 | 
| 
 | 
  1064       nofit = unique(c(V1[sign_final == "-1"], V2[sign_final == "-1"]))
 | 
| 
 | 
  1065       
 | 
| 
 | 
  1066     }
 | 
| 
 | 
  1067   }
 | 
| 
 | 
  1068 
 | 
| 
 | 
  1069   ## exclude all nofit
 | 
| 
 | 
  1070   
 | 
| 
 | 
  1071   df2 = data.frame(V1, V2, sign = blastTable$sign, sign_final = sign_final, stringsAsFactors = FALSE)
 | 
| 
 | 
  1072   rm(blastTable)
 | 
| 
 | 
  1073   gc()
 | 
| 
 | 
  1074   vertices = data.frame(name = vnames, reverse_complement = vnames %in% flip_names)
 | 
| 
 | 
  1075   G = graph.data.frame(df2, vertices = vertices)
 | 
| 
 | 
  1076   vcount_ori = vcount(G)
 | 
| 
 | 
  1077   G = induced.subgraph(G, vids = which(!V(G)$name %in% nofit))
 | 
| 
 | 
  1078   
 | 
| 
 | 
  1079   G$escore_mst = sum(esign)/length(esign)
 | 
| 
 | 
  1080   G$escore = sum(sign_final == 1)/length(sign_final)
 | 
| 
 | 
  1081   cc = clusters(G, "strong")
 | 
| 
 | 
  1082   mb = as.numeric(factor(membership(cc), levels = as.numeric(names(sort(table(membership(cc)), 
 | 
| 
 | 
  1083                                                                         decreasing = TRUE)))))
 | 
| 
 | 
  1084   names(mb) = V(G)$name
 | 
| 
 | 
  1085   G$loop_index = max(cc$csize)/vcount(G)
 | 
| 
 | 
  1086   G$coverage = vcount(G)/vcount_ori
 | 
| 
 | 
  1087   ## check sign in all edges
 | 
| 
 | 
  1088   V(G)$membership = mb
 | 
| 
 | 
  1089   save(G, file = directed_graph_destination)
 | 
| 
 | 
  1090   ## remove nofit
 | 
| 
 | 
  1091   seqs = seqs[!names(seqs) %in% nofit]
 | 
| 
 | 
  1092   seqs[names(seqs) %in% flip_names] = reverseComplement(seqs[names(seqs) %in% flip_names])
 | 
| 
 | 
  1093   names(seqs) = paste(names(seqs), mb[names(seqs)])
 | 
| 
 | 
  1094   writeXStringSet(seqs, filepath = oriented_sequences)
 | 
| 
 | 
  1095   ## calculate satellite probability
 | 
| 
 | 
  1096   if (is.null(satellite_model_path)) {
 | 
| 
 | 
  1097     pSAT = isSAT = NULL
 | 
| 
 | 
  1098   } else {
 | 
| 
 | 
  1099     satellite_model = readRDS(satellite_model_path)
 | 
| 
 | 
  1100     pSAT = get_prob(G$loop_index, pair_completeness, model = satellite_model)
 | 
| 
 | 
  1101     isSAT = isSatellite(G$loop_index, pair_completeness, model = satellite_model)
 | 
| 
 | 
  1102   }
 | 
| 
 | 
  1103   
 | 
| 
 | 
  1104   ## get larges cc
 | 
| 
 | 
  1105   output = list(escore = G$escore, coverage = G$coverage, escore_mts = G$escore_mst, 
 | 
| 
 | 
  1106                 loop_index = G$loop_index, pair_completeness = pair_completeness, graph_file = graph_destination, 
 | 
| 
 | 
  1107                 oriented_sequences = oriented_sequences, vcount = vcount(G), ecount = ecount(G), 
 | 
| 
 | 
  1108                 satellite_probability = pSAT, satellite = isSAT)
 | 
| 
 | 
  1109   ## clean up
 | 
| 
 | 
  1110   all_objects = ls()
 | 
| 
 | 
  1111   do_not_remove = "output"
 | 
| 
 | 
  1112   rm(list = all_objects[!(all_objects %in% do_not_remove)])
 | 
| 
 | 
  1113   gc(verbose = FALSE, reset = TRUE)
 | 
| 
 | 
  1114   return(list2dictionary(output))
 | 
| 
 | 
  1115 }
 | 
| 
 | 
  1116 
 | 
| 
 | 
  1117 list2dictionary = function(l){
 | 
| 
 | 
  1118     dict = "{"
 | 
| 
 | 
  1119     q='"'
 | 
| 
 | 
  1120     for (i in 1:length(l)){
 | 
| 
 | 
  1121         if (class(l[[i]])=="character" | is.null(l[[i]])){
 | 
| 
 | 
  1122             q2 = "'''"
 | 
| 
 | 
  1123         }else{
 | 
| 
 | 
  1124             q2 = ''
 | 
| 
 | 
  1125         }
 | 
| 
 | 
  1126         dict = paste0(
 | 
| 
 | 
  1127             dict,
 | 
| 
 | 
  1128             q,names(l)[i],q,":",q2, l[[i]], q2,", "
 | 
| 
 | 
  1129         )
 | 
| 
 | 
  1130     }
 | 
| 
 | 
  1131     dict = paste0(dict, "}")
 | 
| 
 | 
  1132     return(dict)
 | 
| 
 | 
  1133 }
 | 
| 
 | 
  1134 
 | 
| 
 | 
  1135 wrap = Vectorize(function(s, width = 80) {
 | 
| 
 | 
  1136   i1 = seq(1, nchar(s), width)
 | 
| 
 | 
  1137   i2 = seq(width, by = width, length.out = length(i1))
 | 
| 
 | 
  1138   return(paste(substring(s, i1, i2), collapse = "\n"))
 | 
| 
 | 
  1139 })
 | 
| 
 | 
  1140 
 | 
| 
 | 
  1141 
 | 
| 
 | 
  1142 tarean = function(input_sequences, output_dir, min_kmer_length = 11, max_kmer_length = 27, 
 | 
| 
 | 
  1143                   CPU = 2, sample_size = 10000, reorient_reads = TRUE, tRNA_database_path = NULL, 
 | 
| 
 | 
  1144                   include_layout = TRUE, paired = TRUE, lock_file=NULL) {
 | 
| 
 | 
  1145   options(CPU = CPU)
 | 
| 
 | 
  1146   time0 = Sys.time()
 | 
| 
 | 
  1147   dir.create(output_dir)
 | 
| 
 | 
  1148   input_sequences_copy = paste(output_dir, "/", basename(input_sequences), sep = "")
 | 
| 
 | 
  1149   
 | 
| 
 | 
  1150   if (!file.copy(input_sequences, input_sequences_copy, overwrite = TRUE)) {
 | 
| 
 | 
  1151     cat(paste("cannot copy", input_sequences, " to", output_dir), "\n")
 | 
| 
 | 
  1152     stop()
 | 
| 
 | 
  1153   }
 | 
| 
 | 
  1154   
 | 
| 
 | 
  1155   lock_file = waitForRAM(lock_file = lock_file)
 | 
| 
 | 
  1156   pair_completeness = NULL
 | 
| 
 | 
  1157   ## sampling
 | 
| 
 | 
  1158   if (sample_size != 0) {
 | 
| 
 | 
  1159     s = readDNAStringSet(input_sequences_copy)
 | 
| 
 | 
  1160     N = length(s)
 | 
| 
 | 
  1161     ## pair completness must be calculated before sampling!
 | 
| 
 | 
  1162     if (N > sample_size) {
 | 
| 
 | 
  1163       writeXStringSet(sample(s, sample_size), filepath = input_sequences_copy)
 | 
| 
 | 
  1164       if (paired) {
 | 
| 
 | 
  1165         pair_counts = tabulate(table(gsub(".$", "", names(s))))
 | 
| 
 | 
  1166         pair_completeness = 1 - pair_counts[1]/sum(pair_counts)
 | 
| 
 | 
  1167       }
 | 
| 
 | 
  1168     }
 | 
| 
 | 
  1169     rm(s)
 | 
| 
 | 
  1170   }
 | 
| 
 | 
  1171   
 | 
| 
 | 
  1172   if (reorient_reads) {
 | 
| 
 | 
  1173     input_sequences_oriented = paste0(input_sequences_copy, "oriented.fasta")
 | 
| 
 | 
  1174     graph_file = paste0(input_sequences_copy, "_graph.RData")
 | 
| 
 | 
  1175     GLfile = paste0(input_sequences_copy, "_graph.GL")
 | 
| 
 | 
  1176     cat("reorientig sequences\n")
 | 
| 
 | 
  1177     
 | 
| 
 | 
  1178     blastTable = mgblast(input_sequences_copy, input_sequences_copy)
 | 
| 
 | 
  1179     
 | 
| 
 | 
  1180     graph_info = mgblast2graph(blastTable, input_sequences_copy, GLfile, graph_file, 
 | 
| 
 | 
  1181                                input_sequences_oriented, include_layout = include_layout, paired = paired, 
 | 
| 
 | 
  1182                                pair_completeness = pair_completeness)
 | 
| 
 | 
  1183     
 | 
| 
 | 
  1184     ## interupt if it does not look like tandem repeat at all # soft threshold!
 | 
| 
 | 
  1185     if (is.null(graph_info$pair_completeness)) {
 | 
| 
 | 
  1186       if (graph_info$loop_index <= 0.4) {
 | 
| 
 | 
  1187         cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 | 
| 
 | 
  1188         return(list2dictionary(list(graph_info = graph_info)))
 | 
| 
 | 
  1189       }
 | 
| 
 | 
  1190     } else {
 | 
| 
 | 
  1191       ## for paired reads:
 | 
| 
 | 
  1192       if (graph_info$loop_index < 0.7 | graph_info$pair_completeness < 0.4) {
 | 
| 
 | 
  1193         cat("CLUSTER DID NOT PASS THRESHOLD!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
 | 
| 
 | 
  1194         return(list2dictionary(list(graph_info = graph_info)))
 | 
| 
 | 
  1195       }
 | 
| 
 | 
  1196     }
 | 
| 
 | 
  1197     
 | 
| 
 | 
  1198   } else {
 | 
| 
 | 
  1199     input_sequences_oriented = input_sequences_copy
 | 
| 
 | 
  1200     graph_info = NULL
 | 
| 
 | 
  1201   }
 | 
| 
 | 
  1202   
 | 
| 
 | 
  1203   
 | 
| 
 | 
  1204   
 | 
| 
 | 
  1205   kmer_counts = list()
 | 
| 
 | 
  1206   kmer_lengths = seq(min_kmer_length, max_kmer_length, by = 4)
 | 
| 
 | 
  1207   for (i in kmer_lengths) {
 | 
| 
 | 
  1208     ## use pythonis.null(l[[i]])) function - faster
 | 
| 
 | 
  1209     cmd = paste(script.dir, "/kmer_counting.py ", input_sequences_oriented, " ", 
 | 
| 
 | 
  1210                 i, sep = "")
 | 
| 
 | 
  1211     cat("calculation kmers of length ", i, "\n")
 | 
| 
 | 
  1212     f = system(cmd, intern = TRUE)
 | 
| 
 | 
  1213     x = read.table(f, as.is = TRUE, sep = "\t")
 | 
| 
 | 
  1214     kmer_counts[[as.character(i)]] = data.frame(x, freq = x$V2/sum(x$V2))
 | 
| 
 | 
  1215   }
 | 
| 
 | 
  1216   
 | 
| 
 | 
  1217   ## number of kmers:
 | 
| 
 | 
  1218   N50 = sapply(kmer_counts, function(x) {
 | 
| 
 | 
  1219     sum(cumsum(x$freq) < 0.5)
 | 
| 
 | 
  1220   })
 | 
| 
 | 
  1221   
 | 
| 
 | 
  1222   N70 = sapply(kmer_counts, function(x) {
 | 
| 
 | 
  1223     sum(cumsum(x$freq) < 0.7)
 | 
| 
 | 
  1224   })
 | 
| 
 | 
  1225   
 | 
| 
 | 
  1226   
 | 
| 
 | 
  1227   time1 = Sys.time()
 | 
| 
 | 
  1228   ggmin = mclapply(kmer_counts, get_mimimal_cc, start = 0.5, mc.cores = CPU)
 | 
| 
 | 
  1229   time2 = Sys.time()
 | 
| 
 | 
  1230   cat("kmers graphs created ")
 | 
| 
 | 
  1231   print(time2 - time1)
 | 
| 
 | 
  1232   names(ggmin) = names(kmer_counts)
 | 
| 
 | 
  1233   
 | 
| 
 | 
  1234   
 | 
| 
 | 
  1235   ## estimate monomer
 | 
| 
 | 
  1236   monomers = mclapply(ggmin, estimate_monomer, mc.cores = CPU)
 | 
| 
 | 
  1237   
 | 
| 
 | 
  1238   names(monomers) = names(kmer_counts)
 | 
| 
 | 
  1239   monomers = monomers[!sapply(monomers, is.null)]
 | 
| 
 | 
  1240   ## error handling:
 | 
| 
 | 
  1241   error = sapply(monomers, class) == "try-error"
 | 
| 
 | 
  1242   if (any(error)) {
 | 
| 
 | 
  1243     cat("\nError in monomers estimation: ")
 | 
| 
 | 
  1244     cat("calculation failed for monomers length ", names(monomers)[error], "\n")
 | 
| 
 | 
  1245     print(monomers[error])
 | 
| 
 | 
  1246     if (any(!error)) {
 | 
| 
 | 
  1247       monomers = monomers[!error]
 | 
| 
 | 
  1248     } else {
 | 
| 
 | 
  1249       stop("monomer estimation failed")
 | 
| 
 | 
  1250     }
 | 
| 
 | 
  1251   }
 | 
| 
 | 
  1252   
 | 
| 
 | 
  1253   ## summary - make function!!
 | 
| 
 | 
  1254   total_score = list()
 | 
| 
 | 
  1255   k = 0
 | 
| 
 | 
  1256   length_best = numeric()
 | 
| 
 | 
  1257   score_bn = numeric()
 | 
| 
 | 
  1258   consensus = character()
 | 
| 
 | 
  1259   
 | 
| 
 | 
  1260   for (i in names(monomers)) {
 | 
| 
 | 
  1261     for (v in seq_along(monomers[[i]]$estimates)) {
 | 
| 
 | 
  1262       k = k + 1
 | 
| 
 | 
  1263       total_score[[k]] = c(kmer = as.numeric(i), variant = v, total_score = monomers[[i]]$estimates[[v]]$total_score)
 | 
| 
 | 
  1264       score_bn[k] = min(rowSums(monomers[[i]]$estimates[[v]]$CM))
 | 
| 
 | 
  1265       length_best[k] = monomers[[i]]$estimates[[v]]$length_variant_score[1, 
 | 
| 
 | 
  1266                                                                          1]
 | 
| 
 | 
  1267       consensus[[k]] = monomers[[i]]$estimates[[v]]$consensus2
 | 
| 
 | 
  1268     }
 | 
| 
 | 
  1269   }
 | 
| 
 | 
  1270   summary_table = as.data.frame(do.call(rbind, total_score))
 | 
| 
 | 
  1271   summary_table$monomer_length = length_best
 | 
| 
 | 
  1272   summary_table$consensus_length = nchar(consensus)
 | 
| 
 | 
  1273   summary_table$score_bn = score_bn
 | 
| 
 | 
  1274   summary_table$consensus = paste("<pre>", wrap(consensus, width = 80), "<pre>", 
 | 
| 
 | 
  1275                                   sep = "")
 | 
| 
 | 
  1276   consensus_DS = DNAStringSet(consensus)
 | 
| 
 | 
  1277   names(consensus_DS) = with(summary_table, paste0(kmer, "_", variant, "_sc_", 
 | 
| 
 | 
  1278                                                    signif(total_score), "_l_", monomer_length))
 | 
| 
 | 
  1279   
 | 
| 
 | 
  1280   ## filter by score - keep
 | 
| 
 | 
  1281   
 | 
| 
 | 
  1282   ## reorder by score
 | 
| 
 | 
  1283   consensus_DS = consensus_DS[order(summary_table$total_score, decreasing = TRUE)]
 | 
| 
 | 
  1284   summary_table = summary_table[order(summary_table$total_score, decreasing = TRUE), 
 | 
| 
 | 
  1285                                 ]
 | 
| 
 | 
  1286   rownames(summary_table) = NULL
 | 
| 
 | 
  1287   N = nrow(summary_table)
 | 
| 
 | 
  1288   ## concatenate concensus(ie. make dimer head 2 tail) for pbs detection and other
 | 
| 
 | 
  1289   ## make something like 'pseudo contig' multimer for mapping - min length 200 bp
 | 
| 
 | 
  1290 
 | 
| 
 | 
  1291   ## searches
 | 
| 
 | 
  1292   consensus_DS_dimer = DNAStringSet(paste0(consensus_DS, consensus_DS))
 | 
| 
 | 
  1293   tarean_contigs = DNAStringSet(sapply(consensus_DS,function(x)
 | 
| 
 | 
  1294     ifelse(nchar(x)<200,
 | 
| 
 | 
  1295            paste(rep(as.character(x),round(300/nchar(as.character(x))+1)),collapse=''),
 | 
| 
 | 
  1296            as.character(x)))
 | 
| 
 | 
  1297     )
 | 
| 
 | 
  1298 
 | 
| 
 | 
  1299   names(consensus_DS_dimer) = names(consensus_DS)
 | 
| 
 | 
  1300                                         # save sequences:
 | 
| 
 | 
  1301   consensus_DS_dimer_file = paste0(output_dir, "/consensus_dimer.fasta")
 | 
| 
 | 
  1302   consensus_DS_file = paste0(output_dir, "/consensus.fasta")
 | 
| 
 | 
  1303   tarean_contig_file = paste0(output_dir, "/tarean_contigs.fasta")
 | 
| 
 | 
  1304   writeXStringSet(consensus_DS, consensus_DS_file)
 | 
| 
 | 
  1305   writeXStringSet(tarean_contigs, tarean_contig_file)
 | 
| 
 | 
  1306   writeXStringSet(consensus_DS_dimer, consensus_DS_dimer_file)
 | 
| 
 | 
  1307   if (is.null(tRNA_database_path)) {
 | 
| 
 | 
  1308     pbs_score = -1
 | 
| 
 | 
  1309   } else {
 | 
| 
 | 
  1310     pbs_blast_table = paste0(output_dir, "/pbs_detection")
 | 
| 
 | 
  1311     pbs_score = detect_pbs(dimers_file = consensus_DS_dimer_file, tRNA_database_path = tRNA_database_path, 
 | 
| 
 | 
  1312                            reads = input_sequences_oriented, output = pbs_blast_table)
 | 
| 
 | 
  1313   }
 | 
| 
 | 
  1314   ## search of open reading frames get the length of the longest
 | 
| 
 | 
  1315   orf_l = max_ORF_length(consensus_DS_dimer)
 | 
| 
 | 
  1316   
 | 
| 
 | 
  1317   
 | 
| 
 | 
  1318   dir.create(paste0(output_dir, "/img"), recursive = TRUE)
 | 
| 
 | 
  1319   summary_table$monomer_length_graph = numeric(N)
 | 
| 
 | 
  1320   summary_table$graph_image = character(N)
 | 
| 
 | 
  1321   summary_table$monomer_length_logo = numeric(N)
 | 
| 
 | 
  1322   summary_table$logo_image = character(N)
 | 
| 
 | 
  1323   ## export graph nd consensus estimate to cluster directory type of output may
 | 
| 
 | 
  1324   ## change in future
 | 
| 
 | 
  1325   save(ggmin, file = paste0(output_dir, "/ggmin.RData"))
 | 
| 
 | 
  1326   save(monomers, file = paste0(output_dir, "/monomers.RData"))
 | 
| 
 | 
  1327   
 | 
| 
 | 
  1328   cat("generating HTML output\n")
 | 
| 
 | 
  1329   for (i in 1:N) {
 | 
| 
 | 
  1330     kmer = as.character(summary_table$kmer[i])
 | 
| 
 | 
  1331     variant = summary_table$variant[i]
 | 
| 
 | 
  1332     ## export graph
 | 
| 
 | 
  1333     fout_link = paste0("img/graph_", kmer, "mer_", variant, ".png")
 | 
| 
 | 
  1334     fout = paste0(output_dir, "/", fout_link)
 | 
| 
 | 
  1335     ## summary_table$monomer_length_graph[i] = summary_table$monomer_length[i]
 | 
| 
 | 
  1336     ## summary_table$monomer_length_logo[[i]] = nrow(monomers[[kmer]]$estimates[[variant]]$CM)
 | 
| 
 | 
  1337     summary_table$monomer_length[[i]] = length(monomers[[kmer]]$estimates[[variant]]$consensus)
 | 
| 
 | 
  1338 
 | 
| 
 | 
  1339     if (i <= 10) {
 | 
| 
 | 
  1340       png(fout, width = 800, height = 800)
 | 
| 
 | 
  1341       plot_kmer_graph(ggmin[[kmer]], highlight = unlist(monomers[[kmer]]$paths[[variant]]$tr_paths))
 | 
| 
 | 
  1342       dev.off()
 | 
| 
 | 
  1343       summary_table$graph_image[i] = hwriteImage(fout_link, link = fout_link, 
 | 
| 
 | 
  1344                                                  table = FALSE, width = 100, height = 100)
 | 
| 
 | 
  1345       ## export logo
 | 
| 
 | 
  1346       png_link = paste0("img/logo_", kmer, "mer_", variant, ".png")
 | 
| 
 | 
  1347       fout = paste0(output_dir, "/", png_link)
 | 
| 
 | 
  1348       png(fout, width = 1200, height = round(summary_table$monomer_length[i] * 
 | 
| 
 | 
  1349                                              1) + 550)
 | 
| 
 | 
  1350       try(plot_multiline_logo(monomers[[kmer]]$estimates[[variant]]$CM, W = 100))
 | 
| 
 | 
  1351       dev.off()
 | 
| 
 | 
  1352       ## export corresponding position probability matrices
 | 
| 
 | 
  1353       ppm_file = paste0(output_dir, '/ppm_', kmer, "mer_", variant, ".csv")
 | 
| 
 | 
  1354       ppm_link = paste0('ppm_', kmer, "mer_", variant, ".csv")
 | 
| 
 | 
  1355       write.table(monomers[[kmer]]$estimates[[variant]]$CM,
 | 
| 
 | 
  1356                   file = ppm_file,
 | 
| 
 | 
  1357                   col.names = TRUE, quote = FALSE,
 | 
| 
 | 
  1358                   row.names = FALSE, sep="\t")
 | 
| 
 | 
  1359       summary_table$logo_image[i] = hwriteImage(png_link, link = ppm_link, 
 | 
| 
 | 
  1360                                                 table = FALSE, width = 200, height = 100)
 | 
| 
 | 
  1361     }
 | 
| 
 | 
  1362     
 | 
| 
 | 
  1363   }
 | 
| 
 | 
  1364   
 | 
| 
 | 
  1365   ## html_report = HTMLReport()
 | 
| 
 | 
  1366   
 | 
| 
 | 
  1367   htmlfile = paste0(output_dir, "/report.html")
 | 
| 
 | 
  1368   cat(htmlheader, file = htmlfile)
 | 
| 
 | 
  1369   included_columns = c('kmer', 'variant', 'total_score', 'consensus_length','consensus', 'graph_image', 'logo_image')
 | 
| 
 | 
  1370   summary_table_clean = summary_table[,included_columns]
 | 
| 
 | 
  1371   colnames(summary_table_clean) = c('k-mer length',
 | 
| 
 | 
  1372                                     'Variant index',
 | 
| 
 | 
  1373                                     'k-mer coverage score',
 | 
| 
 | 
  1374                                     'Consensus length',
 | 
| 
 | 
  1375                                     'Consensus sequence',
 | 
| 
 | 
  1376                                     'k-mer based graph',
 | 
| 
 | 
  1377                                     'Sequence logo')
 | 
| 
 | 
  1378   HTML(summary_table_clean, file = htmlfile, sortableDF = TRUE)
 | 
| 
 | 
  1379   HTMLEndFile(file = htmlfile)
 | 
| 
 | 
  1380   time4 = Sys.time()
 | 
| 
 | 
  1381   print(time4 - time0)
 | 
| 
 | 
  1382   if (!is.null(lock_file)){
 | 
| 
 | 
  1383     print("------removing-lock--------")
 | 
| 
 | 
  1384     removelock(lock_file)
 | 
| 
 | 
  1385   }
 | 
| 
 | 
  1386 
 | 
| 
 | 
  1387   print(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
 | 
| 
 | 
  1388              TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
 | 
| 
 | 
  1389              TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
 | 
| 
 | 
  1390              orf_l = orf_l, tarean_contig_file = tarean_contig_file))
 | 
| 
 | 
  1391   return(list2dictionary(list(htmlfile = htmlfile, TR_score = summary_table$total_score[1],
 | 
| 
 | 
  1392               TR_monomer_length = as.numeric(summary_table$consensus_length[1]),
 | 
| 
 | 
  1393               TR_consensus = summary_table$consensus[1], pbs_score = pbs_score, graph_info = graph_info,
 | 
| 
 | 
  1394               orf_l = orf_l, tarean_contig_file = tarean_contig_file)))
 | 
| 
 | 
  1395 }
 | 
| 
 | 
  1396 
 | 
| 
 | 
  1397 
 | 
| 
 | 
  1398 ## graph loop index stability
 | 
| 
 | 
  1399 loop_index_instability = function(G) {
 | 
| 
 | 
  1400   N = 50
 | 
| 
 | 
  1401   s = seq(vcount(G), vcount(G)/10, length.out = N)
 | 
| 
 | 
  1402   p = seq(1, 0.1, length.out = N)
 | 
| 
 | 
  1403   li = numeric()
 | 
| 
 | 
  1404   for (i in seq_along(s)) {
 | 
| 
 | 
  1405     print(i)
 | 
| 
 | 
  1406     gs = induced_subgraph(G, sample(1:vcount(G), s[i]))
 | 
| 
 | 
  1407     li[i] = max(clusters(gs, "strong")$csize)/vcount(gs)
 | 
| 
 | 
  1408   }
 | 
| 
 | 
  1409   instability = lm(li ~ p)$coefficient[2]
 | 
| 
 | 
  1410   return(instability)
 | 
| 
 | 
  1411 }
 | 
| 
 | 
  1412 
 | 
| 
 | 
  1413 isSatellite = function(x, y, model) {
 | 
| 
 | 
  1414   p = get_prob(x, y, model)
 | 
| 
 | 
  1415   if (p > model$cutoff) {
 | 
| 
 | 
  1416     return("Putative Satellite")
 | 
| 
 | 
  1417   } else {
 | 
| 
 | 
  1418     return("")
 | 
| 
 | 
  1419   }
 | 
| 
 | 
  1420 }
 | 
| 
 | 
  1421 
 | 
| 
 | 
  1422 get_prob = function(x, y, model) {
 | 
| 
 | 
  1423   pm = model$prob_matrix
 | 
| 
 | 
  1424   N = ncol(pm)
 | 
| 
 | 
  1425   i = round(x * (N - 1)) + 1
 | 
| 
 | 
  1426   j = round(y * (N - 1)) + 1
 | 
| 
 | 
  1427   p = pm[i, j]
 | 
| 
 | 
  1428   return(p)
 | 
| 
 | 
  1429 }
 | 
| 
 | 
  1430 
 | 
| 
 | 
  1431 
 | 
| 
 | 
  1432 detectMemUsage = function() {
 | 
| 
 | 
  1433   con = textConnection(gsub(" +", " ", readLines("/proc/meminfo")))
 | 
| 
 | 
  1434   memInfo = read.table(con, fill = TRUE, row.names = 1)
 | 
| 
 | 
  1435   close(con)
 | 
| 
 | 
  1436   memUsage = 1 - (memInfo["MemFree", 1] + memInfo["Cached", 1])/memInfo["MemTotal", 
 | 
| 
 | 
  1437                                                                         1]
 | 
| 
 | 
  1438   return(memUsage)
 | 
| 
 | 
  1439 }
 | 
| 
 | 
  1440 
 | 
| 
 | 
  1441 
 | 
| 
 | 
  1442 makelock<-function(lockfile,lockmsg,CreateDirectories=TRUE){
 | 
| 
 | 
  1443     lockdir=dirname(lockfile)
 | 
| 
 | 
  1444     if(!file.exists(lockdir)){
 | 
| 
 | 
  1445         if(CreateDirectories) dir.create(lockdir,recursive=TRUE)
 | 
| 
 | 
  1446         else stop("Lock Directory for lockfile ",lockfile," does not exist")
 | 
| 
 | 
  1447     } 
 | 
| 
 | 
  1448     if(missing(lockmsg)) lockmsg=paste(system('hostname',intern=TRUE),Sys.getenv("R_SESSION_TMPDIR"))
 | 
| 
 | 
  1449     if (file.exists(lockfile)) return (FALSE)
 | 
| 
 | 
  1450                                         # note the use of paste makes the message writing atomic
 | 
| 
 | 
  1451     cat(paste(lockmsg,"\n",sep=""),file=lockfile,append=TRUE,sep="")
 | 
| 
 | 
  1452     firstline=readLines(lockfile,n=1)
 | 
| 
 | 
  1453     if(firstline!=lockmsg){
 | 
| 
 | 
  1454                                         # somebody else got there first
 | 
| 
 | 
  1455         return(FALSE)
 | 
| 
 | 
  1456     } else return(TRUE)
 | 
| 
 | 
  1457 }
 | 
| 
 | 
  1458 
 | 
| 
 | 
  1459 
 | 
| 
 | 
  1460 removelock<-function(lockfile){
 | 
| 
 | 
  1461   if(unlink(lockfile)!=0) {
 | 
| 
 | 
  1462     warning("Unable to remove ",lockfile)
 | 
| 
 | 
  1463     return (FALSE)
 | 
| 
 | 
  1464   }
 | 
| 
 | 
  1465   return (TRUE)
 | 
| 
 | 
  1466 }
 | 
| 
 | 
  1467 
 | 
| 
 | 
  1468 
 | 
| 
 | 
  1469 waitForRAM = function(p = 0.5,lock_file=NULL) {
 | 
| 
 | 
  1470   if (detectMemUsage() < p) {
 | 
| 
 | 
  1471     return(NULL)
 | 
| 
 | 
  1472     ## check lock file:
 | 
| 
 | 
  1473   } else {
 | 
| 
 | 
  1474     cat("waiting for RAM \n")
 | 
| 
 | 
  1475     free_count = 0
 | 
| 
 | 
  1476     while (TRUE) {
 | 
| 
 | 
  1477         if (makelock(lock_file)){
 | 
| 
 | 
  1478             print("---------locking--------")
 | 
| 
 | 
  1479             return(lock_file)
 | 
| 
 | 
  1480         }
 | 
| 
 | 
  1481       if (detectMemUsage() < p) {
 | 
| 
 | 
  1482         cat("RAM freed \n")
 | 
| 
 | 
  1483         return(NULL)
 | 
| 
 | 
  1484       }
 | 
| 
 | 
  1485       Sys.sleep(5)
 | 
| 
 | 
  1486       if (evaluate_user_cpu_usage() == 'free'){
 | 
| 
 | 
  1487         free_count = free_count + 1
 | 
| 
 | 
  1488       }else{
 | 
| 
 | 
  1489         free_count = 0
 | 
| 
 | 
  1490       }
 | 
| 
 | 
  1491       if (detectMemUsage() < 0.8 & free_count > 100){
 | 
| 
 | 
  1492         cat("RAM not free but nothing else is running \n")
 | 
| 
 | 
  1493         return(NULL)
 | 
| 
 | 
  1494       }
 | 
| 
 | 
  1495     }
 | 
| 
 | 
  1496   }
 | 
| 
 | 
  1497 }
 | 
| 
 | 
  1498 
 | 
| 
 | 
  1499 lsmem = function() {
 | 
| 
 | 
  1500   g = globalenv()
 | 
| 
 | 
  1501   out_all = envs = list()
 | 
| 
 | 
  1502   envs = append(envs, g)
 | 
| 
 | 
  1503   total_size = numeric()
 | 
| 
 | 
  1504   while (environmentName(g) != "R_EmptyEnv") {
 | 
| 
 | 
  1505     g <- parent.env(g)
 | 
| 
 | 
  1506     envs = append(envs, g)
 | 
| 
 | 
  1507   }
 | 
| 
 | 
  1508   for (e in envs) {
 | 
| 
 | 
  1509     
 | 
| 
 | 
  1510     obj = ls(envir = e)
 | 
| 
 | 
  1511     if (length(obj) == 0) {
 | 
| 
 | 
  1512       break
 | 
| 
 | 
  1513     }
 | 
| 
 | 
  1514     obj.size = list()
 | 
| 
 | 
  1515     for (i in obj) {
 | 
| 
 | 
  1516       obj.size[[i]] = object.size(get(i, envir = e))
 | 
| 
 | 
  1517     }
 | 
| 
 | 
  1518     out = data.frame(object = obj, size = unlist(obj.size), stringsAsFactors = FALSE)
 | 
| 
 | 
  1519     out = out[order(out$size, decreasing = TRUE), ]
 | 
| 
 | 
  1520     out_all = append(out_all, out)
 | 
| 
 | 
  1521     total_size = append(total_size, sum(out$size))
 | 
| 
 | 
  1522   }
 | 
| 
 | 
  1523   return(list(objects = out_all, total_size = total_size))
 | 
| 
 | 
  1524 } 
 | 
| 
 | 
  1525 
 | 
| 
 | 
  1526 evaluate_user_cpu_usage = function(){
 | 
| 
 | 
  1527   user = Sys.info()["user"]
 | 
| 
 | 
  1528   a = sum(as.numeric (system(paste ("ps -e -o %cpu -u", user), intern = TRUE)[-1]))
 | 
| 
 | 
  1529   s = substring (system(paste ("ps -e -o stat -u", user), intern = TRUE)[-1],1,1)
 | 
| 
 | 
  1530   if (a<5 & sum(s %in% 'D')==0 & sum(s%in% 'R')<2){
 | 
| 
 | 
  1531     status = 'free'
 | 
| 
 | 
  1532   }else{
 | 
| 
 | 
  1533     status = 'full'
 | 
| 
 | 
  1534   }
 | 
| 
 | 
  1535   return(status)
 | 
| 
 | 
  1536 }
 |