| 0 | 1 #!/usr/bin/env Rscript | 
|  | 2 Sys.setlocale("LC_CTYPE", "en_US.UTF-8")  # this is necessary for handling unicode characters (data.tree package) | 
|  | 3 suppressPackageStartupMessages(library(data.tree)) | 
|  | 4 suppressPackageStartupMessages(library(stringr)) | 
|  | 5 suppressPackageStartupMessages(library(R2HTML)) | 
|  | 6 suppressPackageStartupMessages(library(hwriter)) | 
|  | 7 suppressPackageStartupMessages(library(DT)) | 
|  | 8 suppressPackageStartupMessages(library(tools)) | 
|  | 9 suppressPackageStartupMessages(library(scales)) | 
|  | 10 suppressPackageStartupMessages(library(igraph)) | 
|  | 11 suppressPackageStartupMessages(library(plotrix)) | 
|  | 12 suppressPackageStartupMessages(library(png)) | 
|  | 13 | 
|  | 14 source("htmlheader.R") | 
|  | 15 source("config.R")  # load tandem ranks info | 
|  | 16 source("utils.R") | 
|  | 17 DT_OPTIONS = options = list(pageLength = 1000, lengthMenu = c(10,50,100,1000,5000,10000)) | 
|  | 18 WD = getwd()   # to get script directory when run from Rserve | 
|  | 19 HTMLHEADER = htmlheader  ## header (character) loaded from htmlheader.R | 
|  | 20 htmlheader = gsub("Superclusters summary","TAREAN summary", htmlheader) | 
|  | 21 | 
|  | 22 evaluate_LTR_detection = function(f){ | 
|  | 23 	NO_LTR=NULL | 
|  | 24 	if (length(readLines(f)) == 11 ){ | 
|  | 25 		return(NO_LTR) | 
|  | 26 	} | 
|  | 27 	df=read.table(f,as.is=TRUE,sep="\t", skip=11,fill=TRUE) | 
|  | 28 	if (ncol(df) != 23){ | 
|  | 29 		#df is smaller if no pbs is detected! | 
|  | 30 		return(NO_LTR) | 
|  | 31 	} | 
|  | 32   df=df[!df$V13 == "",,drop=FALSE] | 
|  | 33 	if (nrow(df)==0){ | 
|  | 34 		return(NO_LTR) | 
|  | 35 	} | 
|  | 36 	# criteria: | 
|  | 37 	df_part=df[df$V15 >=12 & df$V20 == 23 & df$V21<df$V20,,drop=FALSE] | 
|  | 38 	if (nrow(df_part) == 0){ | 
|  | 39 		return(NO_LTR) | 
|  | 40 	} | 
|  | 41   PBS_type = gsub("_","",(str_extract_all(df_part$V13, pattern="_([A-Z][a-z]{2})", simplify=TRUE))) %>% | 
|  | 42       paste(collapse=" ") | 
|  | 43 	return(PBS_type) | 
|  | 44 } | 
|  | 45 | 
|  | 46 | 
|  | 47 | 
|  | 48 ## annotate superclusters | 
|  | 49 select_reads_id = function(index, search_type = c("cluster","supercluster")) { | 
|  | 50     ## select read if base on the supecluster index need database connection | 
|  | 51     ## HITSORTDB! | 
|  | 52     search_type = match.arg(search_type) | 
|  | 53     x = dbGetQuery(HITSORTDB, | 
|  | 54                    paste0("SELECT vertexname FROM vertices WHERE vertexindex IN ", | 
|  | 55                           "(SELECT vertexindex  FROM communities ", | 
|  | 56                           "WHERE ", search_type,"=\"", index, | 
|  | 57                           "\")")) | 
|  | 58     return(x$vertexname) | 
|  | 59 } | 
|  | 60 | 
|  | 61 | 
|  | 62 get_reads_annotation = function(reads_id) { | 
|  | 63     ## select annotation from tables in SEQDB which has name in format *_database | 
|  | 64     annot_types = grep("_database", dbListTables(SEQDB), value = TRUE) | 
|  | 65     annot = list() | 
|  | 66     for (i in annot_types) { | 
|  | 67         query = paste0("SELECT * FROM ", i, " WHERE name IN (", paste0("\"", reads_id, | 
|  | 68             "\"", collapse = ", "), ")") | 
|  | 69         annot[[i]] = dbGetQuery(SEQDB, query) | 
|  | 70     } | 
|  | 71     return(annot) | 
|  | 72 } | 
|  | 73 | 
|  | 74 supercluster_size = function(supercluster) { | 
|  | 75     x = dbGetQuery(HITSORTDB, paste0("SELECT count(*) FROM vertices WHERE vertexindex IN ", | 
|  | 76         "(SELECT vertexindex  FROM communities ", "WHERE supercluster=\"", supercluster, | 
|  | 77         "\")")) | 
|  | 78     return(x$"count(*)") | 
|  | 79 } | 
|  | 80 | 
|  | 81 | 
|  | 82 cluster_annotation = function(cluster, search_type = c("cluster", "supercluster")){ | 
|  | 83     ## searcheither for cluster or supercluster annotation | 
|  | 84     ## read annotation from sqlite databases database is access though SEQDB (sequence | 
|  | 85     ## annotation) and HITSORTDB - clustering information | 
|  | 86     search_type = match.arg(search_type) | 
|  | 87     reads_id = select_reads_id(cluster, search_type) | 
|  | 88     annot = get_reads_annotation(reads_id) | 
|  | 89     return(annot) | 
|  | 90 } | 
|  | 91 | 
|  | 92 get_tarean_info = function(cluster, search_type = c("cluster", "supercluster")){ | 
|  | 93     search_type = match.arg(search_type) | 
|  | 94     if (search_type == "cluster") { | 
|  | 95         search_type = "[index]" | 
|  | 96     } | 
|  | 97     tarean_info = dbGetQuery(HITSORTDB, | 
|  | 98                          paste0( | 
|  | 99                              "SELECT [index], supercluster, satellite_probability, tandem_rank, size_real, satellite FROM cluster_info WHERE ", | 
|  | 100                              search_type, " = ", cluster)) | 
|  | 101     nhits = sum(tarean_info$size_real[tarean_info$tandem_rank %in% 1:2]) | 
|  | 102     proportion = nhits/sum(tarean_info$size_real) | 
|  | 103     return( list( | 
|  | 104         nhits = nhits, | 
|  | 105         proportion = proportion, | 
|  | 106         tarean_info = tarean_info) | 
|  | 107         ) | 
|  | 108 | 
|  | 109 } | 
|  | 110 | 
|  | 111 get_ltr_info = function(supercluster){ | 
|  | 112   ltr_info = dbGetQuery(HITSORTDB, | 
|  | 113                         paste0( | 
|  | 114                           "SELECT [index], supercluster, ltr_detection, size_real FROM cluster_info WHERE ", | 
|  | 115                           "supercluster", " = ", supercluster)) | 
|  | 116   return(ltr_info) | 
|  | 117 } | 
|  | 118 | 
|  | 119 summarize_annotation = function(annot, size = NULL){ | 
|  | 120     db_id = do.call(rbind, annot)$db_id | 
|  | 121     cl_string = gsub("^.+/","",gsub(":.+$", "", gsub("^.+#", "", db_id))) | 
|  | 122     weight = do.call(rbind, annot)$bitscore | 
|  | 123     domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", | 
|  | 124                                                           "") | 
|  | 125     domain[is.na(domain)] = "" | 
|  | 126     domain_table = table(cl_string, domain) | 
|  | 127     dt = as.data.frame(domain_table) | 
|  | 128     ## remove no count | 
|  | 129     dt = dt[dt$Freq != 0, ,drop = FALSE] | 
|  | 130     rownames(dt) = paste(dt$cl_string,dt$domain) | 
|  | 131     if (!is.null(size)){ | 
|  | 132         dt$proportion = dt$Freq / size | 
|  | 133 | 
|  | 134     } | 
|  | 135     return(dt) | 
|  | 136 } | 
|  | 137 | 
|  | 138 annot2colors = function(annot, ids, base_color = "#00000050"){ | 
|  | 139     annot_table = do.call(rbind, annot) | 
|  | 140     domains = str_replace(str_extract(str_replace(annot_table$db_id, "^.+#", ""), ":.+"), ":","") | 
|  | 141     other_annot = str_replace(annot_table$db_id, "^.+#", "") | 
|  | 142     complete_annot = ifelse(is.na(domains), other_annot, domains) | 
|  | 143     names(complete_annot) = annot_table$name | 
|  | 144     unique_complete_annot = names (table(complete_annot) %>% sort(decreasing = TRUE)) | 
|  | 145     color_key = unique_colors[seq_along(unique_complete_annot)] | 
|  | 146     names(color_key) = unique_complete_annot | 
|  | 147     color_table = rep(base_color, length(ids)) | 
|  | 148     names(color_table) = ids | 
|  | 149     color_table[names(complete_annot)] = color_key[complete_annot] | 
|  | 150     return(list( color_table = color_table, legend = color_key)) | 
|  | 151 } | 
|  | 152 | 
|  | 153 | 
|  | 154  read_annotation_to_tree = function(supercluster, TREE_TEMPLATE = CLS_TREE) { | 
|  | 155      ## CLS_TREE is datatree containing 'taxonomy' information of repeats, attribute of | 
|  | 156      ## each node/ leave if filled up from annotation annotation is added to leaves | 
|  | 157      ## only!! Inner node are NA or NULL | 
|  | 158      annot0 = cluster_annotation(supercluster, search_type = "supercluster") | 
|  | 159      ## keep only built in annotation | 
|  | 160      annot = list(protein_databse = annot0$protein_database, dna_database = annot0$dna_database) | 
|  | 161      annot$dna_database$bitscore = annot$dna_database$bitscore / 2 #DNA bitscore corection, blastx - more weight | 
|  | 162      annot = do.call(rbind, annot) | 
|  | 163      ## remove duplicated hits, keep best | 
|  | 164      annot_clean = annot[order(annot$bitscore, decreasing = TRUE), ] | 
|  | 165      annot_clean = annot_clean[!duplicated(annot_clean$name), ] | 
|  | 166      tarean_info = get_tarean_info(supercluster, search_type = "supercluster") | 
|  | 167      ltr_info = get_ltr_info(supercluster) | 
|  | 168      db_id = annot_clean$db_id | 
|  | 169      cl_string = gsub(":.+$", "", gsub("^.+#", "", db_id)) | 
|  | 170      weight = annot_clean$bitscore | 
|  | 171      domain = str_replace(str_extract(str_replace(db_id, "^.+#", ""), ":.+"), ":", | 
|  | 172         "") | 
|  | 173      domain[is.na(domain)] = "NA" | 
|  | 174      mean_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = function(x) signif(mean(x))) | 
|  | 175      mean_weight_table[is.na(mean_weight_table)] = 0 | 
|  | 176      total_weight_table = by(weight, INDICES = list(cl_string, domain), FUN = sum) | 
|  | 177      total_weight_table[is.na(total_weight_table)] = 0 | 
|  | 178      cl_table = table(cl_string) | 
|  | 179      domain_table = table(cl_string, domain) | 
|  | 180      cls_tree = Clone(TREE_TEMPLATE) | 
|  | 181      cls_tree$size = supercluster_size(supercluster) | 
|  | 182      cls_tree$ltr_info = ltr_info | 
|  | 183      for (i in cls_tree$leaves) { | 
|  | 184          if (i$full_name %in% names(cl_table)) { | 
|  | 185              i$nhits = cl_table[[i$full_name]] | 
|  | 186              i$domains = domain_table[i$full_name, ] | 
|  | 187              names(i$domains) = colnames(domain_table) | 
|  | 188              i$mean_weight = mean_weight_table[i$full_name, ] | 
|  | 189              i$total_weight = total_weight_table[i$full_name, ] | 
|  | 190              i$proportion = signif(i$nhits/cls_tree$size, 3) | 
|  | 191          } else { | 
|  | 192              if (i$name == "satellite"){ | 
|  | 193                i$nhits = tarean_info$nhits | 
|  | 194                  i$tandem_rank = tarean_info$tarean_info$tandem_rank | 
|  | 195                  i$proportion = tarean_info$proportion | 
|  | 196                  i$tarean_info = tarean_info$tarean_info | 
|  | 197              }else{ | 
|  | 198                  i$proportion = 0 | 
|  | 199              } | 
|  | 200          } | 
|  | 201      } | 
|  | 202      ## create domain string for printing: | 
|  | 203      for (i in Traverse(cls_tree)) { | 
|  | 204          if (is.null(i$domains)) { | 
|  | 205              i$features = "" | 
|  | 206          } else if (sum(i$domains) == 0) { | 
|  | 207              i$features = "" | 
|  | 208          } else { | 
|  | 209              dom = i$domains[!names(i$domains) == "NA"] | 
|  | 210              i$features = gsub(" *\\(\\)", "", pasteDomains(dom)) | 
|  | 211          } | 
|  | 212      } | 
|  | 213      ## add LTR info: | 
|  | 214      if  (any(!cls_tree$ltr_info$ltr_detection %in% "None")){ | 
|  | 215        tr = FindNode(cls_tree, "LTR") | 
|  | 216        tr$features = "LTR/PBS" | 
|  | 217       } | 
|  | 218      return(cls_tree) | 
|  | 219  } | 
|  | 220 | 
|  | 221 pasteDomains = function(dom){ | 
|  | 222     d = dom[dom != 0] | 
|  | 223     if (length(d) == 0){ | 
|  | 224         return("") | 
|  | 225     }else{ | 
|  | 226         paste0(d, " (", names(d), ")", sep = ", ", collapse="") | 
|  | 227     } | 
|  | 228 | 
|  | 229 } | 
|  | 230 | 
|  | 231 rescale = function(x, newrange) { | 
|  | 232     # taken from plotrix package | 
|  | 233     if (missing(x) | missing(newrange)) { | 
|  | 234         usage.string <- paste("Usage: rescale(x,newrange)\n", "\twhere x is a numeric object and newrange is the new min and max\n", | 
|  | 235             sep = "", collapse = "") | 
|  | 236         stop(usage.string) | 
|  | 237     } | 
|  | 238     if (is.numeric(x) && is.numeric(newrange)) { | 
|  | 239         xna <- is.na(x) | 
|  | 240         if (all(xna)) | 
|  | 241             return(x) | 
|  | 242         if (any(xna)) | 
|  | 243             xrange <- range(x[!xna]) else xrange <- range(x) | 
|  | 244         if (xrange[1] == xrange[2]) | 
|  | 245             return(x) | 
|  | 246         mfac <- (newrange[2] - newrange[1])/(xrange[2] - xrange[1]) | 
|  | 247         return(newrange[1] + (x - xrange[1]) * mfac) | 
|  | 248     } else { | 
|  | 249         warning("Only numeric objects can be rescaled") | 
|  | 250         return(x) | 
|  | 251     } | 
|  | 252 } | 
|  | 253 | 
|  | 254 add_value_to_nodes = function(tr, value_names = c("nhits", "domains", "total_weight", | 
|  | 255     "proportion")) { | 
|  | 256     ## propagate values from leaves to nodes, assume that nodes are not defined | 
|  | 257     ## (either NA or NULL) leaves coud be be either numeric of list, if list values | 
|  | 258     ## are summed up based on the names | 
|  | 259     for (vn in value_names) { | 
|  | 260         for (i in Traverse(tr)) { | 
|  | 261           w = add_leaves_value(i$leaves, vn) | 
|  | 262           i[[vn]] = w | 
|  | 263         } | 
|  | 264     } | 
|  | 265 } | 
|  | 266 | 
|  | 267 add_leaves_value = function(x, name) { | 
|  | 268     ## check if annotation is multivalue or single value, propagates values from | 
|  | 269     ## leaves to root | 
|  | 270     n = unique(unlist(lapply(lapply(x, "[[", name), names))) | 
|  | 271     if (is.null(n)) { | 
|  | 272       ## no names given, we can sum everything to one value | 
|  | 273         total = sum(unlist(sapply(x, "[[", name)), na.rm = TRUE) | 
|  | 274         return(total) | 
|  | 275     } else { | 
|  | 276         total = numeric(length(n)) | 
|  | 277         names(total) = n | 
|  | 278         xvals = lapply(x, "[[", name) | 
|  | 279         N = 0 | 
|  | 280         for (i in xvals) { | 
|  | 281             for (j in names(i)) { | 
|  | 282                 total[j] = total[j] + i[j] | 
|  | 283                 N = N + 1 | 
|  | 284             } | 
|  | 285         } | 
|  | 286         return(total) | 
|  | 287     } | 
|  | 288 } | 
|  | 289 | 
|  | 290 | 
|  | 291 trmap = function(tr, xl = 0, yb = 0, xr = 1, yt = 1, first = TRUE, vertical = TRUE) { | 
|  | 292     # treemap for data.tree plotting - experimental | 
|  | 293     if (first) { | 
|  | 294         plot(xl:xr, yb:yt, type = "n", axes = FALSE, xlab = "", ylab = "") | 
|  | 295     } | 
|  | 296     if (tr$nhits > 0) { | 
|  | 297         width = 2 * (6 - tr$level) | 
|  | 298         rect(xl, yb, xr, yt, col = "#00008805", lwd = width, border = "#00000080") | 
|  | 299         if (tr$level != 1) { | 
|  | 300             text(x = (xl + xr)/2, y = (yb + yt)/2, labels = tr$name, cex = width/4) | 
|  | 301         } | 
|  | 302     } else { | 
|  | 303         return(NULL) | 
|  | 304     } | 
|  | 305     Nchildren = length(tr$children) | 
|  | 306     cat("children:", tr$name, tr$level, "\n") | 
|  | 307     if (Nchildren == 0) { | 
|  | 308         return(NULL) | 
|  | 309     } | 
|  | 310     sizes = sapply(tr$children, "[[", "nhits") | 
|  | 311     rng = c(sizes, 0) / sum(sizes) | 
|  | 312     if (vertical) { | 
|  | 313         ## x does not change | 
|  | 314         xl2 = rep(xl, Nchildren) | 
|  | 315         xr2 = rep(xr, Nchildren) | 
|  | 316         yb2 = rescale(rng, c(yb, yt))[1:(Nchildren + 1)] | 
|  | 317         yt2 = rescale(rng, c(yb, yt))[-1] | 
|  | 318     } else { | 
|  | 319         yb2 = rep(yb, Nchildren) | 
|  | 320         yt2 = rep(yt, Nchildren) | 
|  | 321         xr2 = rescale(rng, c(xl, xr))[1:(Nchildren + 1)] | 
|  | 322         xl2 = rescale(rng, c(xl, xr))[-1] | 
|  | 323     } | 
|  | 324     for (i in 1:Nchildren) { | 
|  | 325         trmap(tr$children[[i]], xl2[i], yb2[i], xr2[i], yt2[i], first = FALSE, vertical = !vertical) | 
|  | 326     } | 
|  | 327 | 
|  | 328 } | 
|  | 329 | 
|  | 330 filter_tree = function(tr) { | 
|  | 331     ## keep only nodes with positive nhits values | 
|  | 332     ## must be used on trees where leaves were already propagated | 
|  | 333     ## using add_values_to_nodes | 
|  | 334     tr_filtered = Clone(tr) | 
|  | 335     Prune(tr_filtered, function(x) x$nhits > 0 | containLTR(x)) | 
|  | 336     return(tr_filtered) | 
|  | 337 } | 
|  | 338 | 
|  | 339 containLTR = function(tr){ | 
|  | 340   for (i in Traverse(tr)){ | 
|  | 341     if (i$features == "LTR/PBS"){ | 
|  | 342       return(TRUE) | 
|  | 343     } | 
|  | 344   } | 
|  | 345   return(FALSE) | 
|  | 346 } | 
|  | 347 | 
|  | 348 filter_tree2 = function(tr) { | 
|  | 349     tr_filtered = Clone(tr) | 
|  | 350     Prune(tr_filtered, function(x) x$parent$nhits > 0) | 
|  | 351     return(tr_filtered) | 
|  | 352 } | 
|  | 353 | 
|  | 354 ## this is for testing purposesm in futurem each node will have its function | 
|  | 355 ## named find_best_hit, it will work like decisin tree. | 
|  | 356 ## functions will be set manually based on training data | 
|  | 357 formatWidth =  function(x, w){ | 
|  | 358     if (length (dim(x)) > 1){ | 
|  | 359         for (i in seq_along(x)){ | 
|  | 360             delta = nchar(x[,i], type="bytes") - nchar(x[,i], type="char") | 
|  | 361             ## delta correction is neccessary for utf-8 correct formating | 
|  | 362             x[,i] = sprintf(paste0("%-",w[i] + delta,"s"), x[,i]) | 
|  | 363         } | 
|  | 364         return(x) | 
|  | 365     }else{ | 
|  | 366         delta = nchar(x, type="bytes") - nchar(x, type="char") | 
|  | 367         sprintf(paste0("%",w + delta,"s"), | 
|  | 368             x) %>% return | 
|  | 369     } | 
|  | 370 } | 
|  | 371 | 
|  | 372 | 
|  | 373 find_best_hit_repeat = function(cls_tree){ | 
|  | 374   ## three children: | 
|  | 375   ## repeat | 
|  | 376   ##  |--rDNA | 
|  | 377   ##  |--satellite      this need special handling, rank 1 or 2 | 
|  | 378   ##  |--mobile_element | 
|  | 379   ## | 
|  | 380   ## in this case we require that satellite has proportion 0.95 | 
|  | 381   ## params of good hit | 
|  | 382   min_prop_of_positive = 0.7 | 
|  | 383   min_prop_second_best = 0.9 | 
|  | 384   min_proportion_x_nhits = 2.5  # based on 0.05 x 50 | 
|  | 385   min_satellite_proportion = 0.95 | 
|  | 386   if (isLeaf(cls_tree)) { | 
|  | 387     return(cls_tree) | 
|  | 388   } | 
|  | 389   cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits | 
|  | 390   if (cond1){ | 
|  | 391     return(cls_tree) | 
|  | 392   } | 
|  | 393   nhits = sapply(cls_tree$children, "[[", "nhits") | 
|  | 394   all_hits = cls_tree$root$nhits | 
|  | 395   name_of_best_hit = cls_tree$children[[which.max(nhits)]]$name | 
|  | 396   best_hit_child = cls_tree$children[[which.max(nhits)]] | 
|  | 397   second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)])) | 
|  | 398   cond2 = max(nhits) / sum(nhits) > min_prop_of_positive | 
|  | 399   cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best | 
|  | 400   cond_satellite = best_hit_child$proportion > min_satellite_proportion & name_of_best_hit == "satellite" | 
|  | 401   cond_other = ! name_of_best_hit == "satellite" | 
|  | 402   cat(cond2, cond3, cond_satellite, cond_other, "\n") | 
|  | 403   if (cond2 & cond3 & (cond_satellite | cond_other)) { | 
|  | 404     ## clear case satellite or other | 
|  | 405     if ("find_best_hit" %in% names(best_hit_child)){ | 
|  | 406       ## custom rules for node is defined | 
|  | 407       best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]]) | 
|  | 408     }else{ | 
|  | 409                                         # use generic rules | 
|  | 410       best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]]) | 
|  | 411     } | 
|  | 412   }else{ | 
|  | 413     ## do more specific tests for rDNA, or mobile element | 
|  | 414     ## rDNA o mobile_elements must be above threshold! | 
|  | 415     cond_sat_rank = any(cls_tree$satellite$tandem_rank == 2) & cls_tree$satellite$nhits != 0 | 
|  | 416     cond_rdna = cls_tree$rDNA$nhits *  cls_tree$rDNA$proportion > min_proportion_x_nhits | 
|  | 417     cond_mobile = cls_tree$mobile_element$nhits * cls_tree$mobile_element$proportion  > min_proportion_x_nhits | 
|  | 418     cat(cond_sat_rank, "\n") | 
|  | 419     cat(cond_rdna,"\n") | 
|  | 420     cat(cond_mobile, "\n") | 
|  | 421     if (cond_sat_rank  & (cond_rdna | cond_mobile) ){ | 
|  | 422       cls_tree$satellite$nhits = 0 | 
|  | 423       best_hit = find_best_hit(cls_tree) | 
|  | 424     }else{ | 
|  | 425       return(cls_tree) | 
|  | 426     } | 
|  | 427   } | 
|  | 428   return(best_hit) | 
|  | 429 } | 
|  | 430 | 
|  | 431 find_best_hit = function(cls_tree){ | 
|  | 432     ## general params of good hit | 
|  | 433     min_prop_of_positive = 0.7 | 
|  | 434     min_prop_second_best = 0.8 | 
|  | 435     min_proportion_x_nhits = 2.5   # based on 0.05 x 50 | 
|  | 436     if (isLeaf(cls_tree)) { | 
|  | 437         return(cls_tree) | 
|  | 438     } | 
|  | 439 | 
|  | 440 | 
|  | 441     cond1 = cls_tree$nhits * cls_tree$proportion < min_proportion_x_nhits | 
|  | 442 | 
|  | 443     if (cond1){ | 
|  | 444         ## return if proportions is lower than threshold | 
|  | 445         return(cls_tree) | 
|  | 446     } | 
|  | 447 | 
|  | 448     nhits = sapply(cls_tree$children, "[[", "nhits") | 
|  | 449     best_hit_child = cls_tree$children[[which.max(nhits)]] | 
|  | 450   ## more special cases: | 
|  | 451     second_max_hits = ifelse(length(nhits) == 1, 0, max(nhits[-which.max(nhits)])) | 
|  | 452     cond2 = max(nhits) / sum(nhits) > min_prop_of_positive | 
|  | 453     cond3 = max(nhits) / (second_max_hits + max(nhits)) > min_prop_second_best | 
|  | 454     if (cond2 & cond3) { | 
|  | 455         if ("find_best_hit" %in% names(best_hit_child)){ | 
|  | 456             ## custom rules for node is defined | 
|  | 457             best_hit = best_hit_child$find_best_hit(cls_tree$children[[which.max(nhits)]]) | 
|  | 458         }else{ | 
|  | 459             # use generic rules | 
|  | 460             best_hit = find_best_hit(cls_tree$children[[which.max(nhits)]]) | 
|  | 461         } | 
|  | 462     } else { | 
|  | 463         return(cls_tree) | 
|  | 464     } | 
|  | 465     return(best_hit) | 
|  | 466 } | 
|  | 467 | 
|  | 468 | 
|  | 469 get_annotation_groups = function(tr){ | 
|  | 470     tr_all = Clone(tr) | 
|  | 471     Prune(tr_all, pruneFun=function(...)FALSE) ## prune everithing -> keep root | 
|  | 472     tr_all$name = "Unclassified repeat (No evidence)" | 
|  | 473     tr_contamination = Clone(tr)$contamination | 
|  | 474     tr_organelle = Clone(tr)$organelle | 
|  | 475     tr_repeat = Clone(tr)$"repeat" | 
|  | 476     tr_repeat$name = "Unclassified_repeat (conflicting evidences)" | 
|  | 477     return( | 
|  | 478         list( | 
|  | 479             tr_repeat = tr_repeat, | 
|  | 480             tr_organelle = tr_organelle, | 
|  | 481             tr_all = tr_all, | 
|  | 482             tr_contamination = tr_contamination | 
|  | 483         ) | 
|  | 484     ) | 
|  | 485 } | 
|  | 486 | 
|  | 487 | 
|  | 488 | 
|  | 489 format_tree = function(cls_tree, ...) { | 
|  | 490     df = ToDataFrameTree(cls_tree, ...) | 
|  | 491     ## try to get rid off unicode characters | 
|  | 492     df[,1] = gsub("\U00B0", "'", df[,1], fixed = TRUE, useBytes = TRUE) | 
|  | 493     df[,1] = gsub("\U00A6", "|", df[,1], fixed = TRUE, useBytes = TRUE) | 
|  | 494     colnames(df)[1] = "                                               "  ## originally levelName | 
|  | 495     if ("proportion" %in% colnames(df)){ | 
|  | 496         df$proportion = signif(df$proportion,2) | 
|  | 497     } | 
|  | 498     if ("Proportion[%]" %in% colnames(df)){ | 
|  | 499         df$"Proportion[%]" = round(df$"Proportion[%]", 2) | 
|  | 500     } | 
|  | 501     ## format header | 
|  | 502     w1 = apply(df, 2, function(x) max(nchar(x))) | 
|  | 503     w2 = nchar(colnames(df)) | 
|  | 504     # use whatever with is bigger for formating | 
|  | 505     w = ifelse(w1 > w2, w1, w2) | 
|  | 506 | 
|  | 507     out = character() | 
|  | 508     header = mapply(FUN = function(x, y) formatWidth(x, w = y), colnames(df), w) %>% | 
|  | 509         paste0(collapse=" | ") | 
|  | 510     hr_line = gsub(".","-",header) | 
|  | 511     # create output | 
|  | 512     # classification lines | 
|  | 513     class_lines = formatWidth(df, w) %>% | 
|  | 514         apply(1, FUN = function(x) paste0(x,collapse = " | ")) %>% | 
|  | 515         paste(collapse = "\n") | 
|  | 516     paste( | 
|  | 517         header, | 
|  | 518         hr_line, | 
|  | 519         class_lines, | 
|  | 520         sep="\n") %>% return | 
|  | 521 } | 
|  | 522 | 
|  | 523 | 
|  | 524 make_final_annotation_template = function( | 
|  | 525                                           annot_attributes = list( | 
|  | 526                                               Nreads = 0, | 
|  | 527                                               Nclusters = 0, | 
|  | 528                                               Nsuperclusters = 0, | 
|  | 529                                               "Proportion[%]" = 0, | 
|  | 530                                               cluster_list = c()) | 
|  | 531                                           ) | 
|  | 532 { | 
|  | 533     ct = Clone(CLS_TREE) | 
|  | 534     for (i in Traverse(ct)){ | 
|  | 535         for (j in names(annot_attributes)){ | 
|  | 536             i[[j]] = annot_attributes[[j]] | 
|  | 537         } | 
|  | 538     } | 
|  | 539     return(ct) | 
|  | 540 } | 
|  | 541 | 
|  | 542 | 
|  | 543 common_ancestor = function(tr1, tr2){ | 
|  | 544   a1 = tr1$Get('name', traversal = "ancestor") | 
|  | 545   a2 = tr2$Get('name', traversal = "ancestor") | 
|  | 546   ancestor = intersect(a1,a2)[1] | 
|  | 547   return(ancestor) | 
|  | 548 } | 
|  | 549 | 
|  | 550 create_all_superclusters_report = function(max_supercluster = 100, | 
|  | 551                                            paths, | 
|  | 552                                            libdir, | 
|  | 553                                            superclusters_dir, | 
|  | 554                                            seqdb, hitsortdb, | 
|  | 555                                            classification_hierarchy_file, | 
|  | 556                                            HTML_LINKS) | 
|  | 557 { | 
|  | 558     ## connect to sqlite databases | 
|  | 559     ## seqdb and hitsortdb are path to sqlite files | 
|  | 560     HTML_LINKS = nested2named_list(HTML_LINKS) | 
|  | 561     paths = nested2named_list(paths) | 
|  | 562     report_file = paths[['supercluster_report_html']] | 
|  | 563     ## create SEQDB, HTISORTDB and CLS_TREE | 
|  | 564     connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | 
|  | 565 | 
|  | 566     ## append specific classication rules | 
|  | 567     CLS_TREE$"repeat"$find_best_hit = find_best_hit_repeat | 
|  | 568 | 
|  | 569     ### | 
|  | 570   cat("Superclusters summary\n", file = report_file) | 
|  | 571   cat("---------------------\n\n", file = report_file, append = TRUE) | 
|  | 572     empty = rep(NA,max_supercluster) | 
|  | 573     SC_table = data.frame( | 
|  | 574         Supercluster = 1 : max_supercluster, "Number of reads" = empty, Automatic_annotation = empty, | 
|  | 575         Similarity_hits = empty, TAREAN_annotation = empty, | 
|  | 576         Clusters = empty, stringsAsFactors = FALSE, check.names = FALSE | 
|  | 577     ) | 
|  | 578     SC_csv = SC_table  # table as spreadsheet - no html formatings | 
|  | 579     final_cluster_annotation = make_final_annotation_template() | 
|  | 580 | 
|  | 581   for (sc in 1 : max_supercluster) { | 
|  | 582         cat("supercluster: ",sc,"\n") | 
|  | 583         cls_tree = read_annotation_to_tree(sc) | 
|  | 584         sc_summary = get_supercluster_summary(sc) | 
|  | 585         add_value_to_nodes(cls_tree) | 
|  | 586         cls_tree_filtered = filter_tree(cls_tree) | 
|  | 587         best_hit = find_best_hit(cls_tree) | 
|  | 588         ## exception to decision tree put here. | 
|  | 589         ## All -> class I ? | 
|  | 590         if (best_hit$name == "All"){ | 
|  | 591           ## check LTR | 
|  | 592           if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){ | 
|  | 593             ## if LTR found - move classification to Class I | 
|  | 594 | 
|  | 595             LTR = FindNode(best_hit, "LTR") | 
|  | 596             nhits_without_class_I = best_hit$nhits - LTR$nhits | 
|  | 597             prop_without_class_I = best_hit$proportion - LTR$proportion | 
|  | 598             cond3 = nhits_without_class_I * prop_without_class_I < 1 | 
|  | 599             cond2 = best_hit$nhits * best_hit$proportion < 0.5  # other hits weak | 
|  | 600             cond1 = LTR$nhits >= (0.95 * best_hit$nhits) # hits are pre in sub class_I | 
|  | 601 | 
|  | 602             if (cond1 | cond2 | cond3){ | 
|  | 603               best_hit = LTR | 
|  | 604             } | 
|  | 605           } | 
|  | 606         }else{ | 
|  | 607           ## check is conflict os best hit  with LTR/PBS exists | 
|  | 608           if (any(!(unique(cls_tree$ltr_info$ltr_detection) %in% "None"))){ | 
|  | 609             LTR = FindNode(cls_tree, "LTR") | 
|  | 610             ca_name = common_ancestor(LTR, best_hit) | 
|  | 611             if (ca_name != "LTR"){ | 
|  | 612               ## reclassify | 
|  | 613               best_hit = FindNode(cls_tree, ca_name) | 
|  | 614             } | 
|  | 615           } | 
|  | 616         } | 
|  | 617 | 
|  | 618         best_hit_name = best_hit$name | 
|  | 619         best_hit_path = paste(best_hit$path, collapse = "/") | 
|  | 620         ## add best hit do database: | 
|  | 621         sqlcmd = paste0( | 
|  | 622             "UPDATE cluster_info SET supercluster_best_hit = \"", best_hit_path, "\" ", | 
|  | 623             " WHERE supercluster = ", sc | 
|  | 624         ) | 
|  | 625         dbExecute(HITSORTDB, sqlcmd) | 
|  | 626         # add annotation annotation summary stored in final_cluster_annotation - summing up | 
|  | 627         best_hit_node = FindNode(final_cluster_annotation, best_hit_name) | 
|  | 628         best_hit_node$Nsuperclusters = best_hit_node$Nsuperclusters + 1 | 
|  | 629         best_hit_node$Nclusters = best_hit_node$Nclusters + sc_summary$Nclusters | 
|  | 630         best_hit_node$Nreads = best_hit_node$Nreads + sc_summary$Nreads | 
|  | 631         best_hit_node$"Proportion[%]" = best_hit_node$"Proportion[%]" + sc_summary$proportion*100 | 
|  | 632         best_hit_node$cluster_list = append(best_hit_node$cluster_list, sc_summary$cluster_list) | 
|  | 633         best_hit_label = ifelse(best_hit_name == "All", "NA", best_hit_name) | 
|  | 634         SC_csv$Automatic_annotation[sc] = best_hit_label | 
|  | 635         SC_csv$"Number of reads"[sc] = SC_table$"Number of reads"[sc] = cls_tree$size | 
|  | 636         SC_csv$Similarity_hits[sc] = format_tree(cls_tree_filtered, | 
|  | 637                                                              "nhits", "proportion", "features") | 
|  | 638         SC_table$Similarity_hits[sc] = format_tree(cls_tree_filtered, | 
|  | 639                                                                "nhits", "proportion", "features") %>% | 
|  | 640             preformatted | 
|  | 641 | 
|  | 642         SC_table$Automatic_annotation[sc] = best_hit_label | 
|  | 643 | 
|  | 644         G = get_supercluster_graph( | 
|  | 645             sc=sc, seqdb = seqdb,hitsortdb = hitsortdb, | 
|  | 646             classification_hierarchy_file = classification_hierarchy_file | 
|  | 647         ) | 
|  | 648         ## append tarean annotation | 
|  | 649         clist = paste(V(G)$name, collapse =",") | 
|  | 650         cl_rank = dbGetQuery(HITSORTDB, | 
|  | 651                              paste("SELECT [index], tandem_rank FROM cluster_info WHERE [index] IN (",clist, | 
|  | 652                                    ") AND tandem_rank IN (1,2)")) | 
|  | 653         ## add information about TR clusters is any | 
|  | 654         if (nrow(cl_rank) > 0){ | 
|  | 655             tarean_annot = paste( | 
|  | 656                 cl_rank$index, "-", | 
|  | 657                 RANKS_TANDEM[cl_rank$tandem_rank] | 
|  | 658             ) | 
|  | 659             SC_csv$TAREAN_annotation[sc] = paste(tarean_annot,collapse="\n") | 
|  | 660             SC_table$TAREAN_annotation[sc] = mapply( | 
|  | 661                 hwrite, tarean_annot, | 
|  | 662                 link = sprintf(HTML_LINKS$ROOT_TO_TAREAN,cl_rank$index)) %>% paste(collapse="<br>") | 
|  | 663         } | 
|  | 664         ## add links to clusters | 
|  | 665         SC_csv$Clusters[sc] = paste((V(G)$name), collapse = ",") | 
|  | 666         links = mapply(hwrite, V(G)$name, link = sprintf(HTML_LINKS$ROOT_TO_CLUSTER, as.integer(V(G)$name))) | 
|  | 667         SC_table$Clusters[sc] = paste(links, collapse =", ") | 
|  | 668         create_single_supercluster_report(G, superclusters_dir) | 
|  | 669     } | 
|  | 670 | 
|  | 671 | 
|  | 672 | 
|  | 673     ## add html links | 
|  | 674     SC_table$Supercluster = mapply(hwrite, SC_table$Supercluster, | 
|  | 675                                    link = sprintf(HTML_LINKS$ROOT_TO_SUPERCLUSTER, | 
|  | 676                                                   SC_table$Supercluster)) | 
|  | 677 | 
|  | 678 | 
|  | 679 | 
|  | 680     write.table(SC_csv, file = paths[["superclusters_csv_summary"]], | 
|  | 681                 sep = "\t", row.names = FALSE) | 
|  | 682 | 
|  | 683 | 
|  | 684     DT_instance = datatable(SC_table, escape = FALSE, options = DT_OPTIONS) | 
|  | 685     saveWidget(DT_instance, file = normalizePath(report_file), | 
|  | 686                normalizePath(libdir), selfcontained = FALSE) | 
|  | 687     add_preamble(normalizePath(report_file), | 
|  | 688                  preamble='<h2>Supercluster annotation</h2> <p><a href="documentation.html#superclust"> For table legend see documentation. <a> </p>') | 
|  | 689 | 
|  | 690     annot_groups = get_annotation_groups(final_cluster_annotation) | 
|  | 691     saveRDS(object=annot_groups, file = paths[['repeat_annotation_summary_rds']]) | 
|  | 692     final_cluster_annotation_formated = paste( | 
|  | 693         sapply(annot_groups, format_tree, 'Proportion[%]', 'Nsuperclusters', 'Nclusters', "Nreads"), | 
|  | 694         collapse = "\n\n\n" | 
|  | 695     ) | 
|  | 696     ## TODO add singleton counts, total counts and extra text - csv output | 
|  | 697     ## export annotation summary | 
|  | 698     html_summary = start_html(paths[['summarized_annotation_html']], gsub("PAGE_TITLE", "Repeat Annotation Summary", HTMLHEADER)) | 
|  | 699     html_summary("Repeat annotation summary", HTML.title, HR=2) | 
|  | 700     html_summary("This table summarizes the automatic annotations of superclusters that should be verified and manually corrected if necessary. Thus, the table should not be used as the final output of the analysis without critical evaluation.", HTML.title, HR=3) | 
|  | 701     html_summary(preformatted(final_cluster_annotation_formated), cat) | 
|  | 702 | 
|  | 703 | 
|  | 704 | 
|  | 705     return() | 
|  | 706 } | 
|  | 707 ## for testing | 
|  | 708 if (FALSE){ | 
|  | 709     create_supercluster_report(1:100, report_file, seqdb, hitsortdb, class_file) | 
|  | 710 } | 
|  | 711 | 
|  | 712 create_single_supercluster_report = function(G, superclusters_dir){ | 
|  | 713     sc_dir = paste0(superclusters_dir, "/dir_SC",sprintf("%04d", G$name)) | 
|  | 714     htmlfile = paste0(sc_dir, "/index.html") | 
|  | 715     dir.create(sc_dir) | 
|  | 716     title = paste("Supercluster no. ",G$name) | 
|  | 717     htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER) | 
|  | 718     cat(htmlheader, file = htmlfile) | 
|  | 719     file.copy(paste0(WD,"/style1.css"), sc_dir) | 
|  | 720     HTML.title(title, file = htmlfile) | 
|  | 721     HTML("Simmilarity hits (only hits with proportion above 0.1% in at least one cluster are shown) ", file = htmlfile) | 
|  | 722     img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), ".png") | 
|  | 723     png(filename = img_file, width = 1400, height=1200) | 
|  | 724     coords = plot_supercluster(G = G) | 
|  | 725     dev.off() | 
|  | 726     ## basename - make link relative | 
|  | 727     href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html") | 
|  | 728     html_insert_image(basename(img_file), htmlfile,coords=coords, href = href) | 
|  | 729 | 
|  | 730     if (is_comparative()){ | 
|  | 731         HTML("Comparative analysis", file = htmlfile) | 
|  | 732         img_file = paste0(sc_dir,"/SC",sprintf("%0d",G$name), "comparative.png") | 
|  | 733         png(filename = img_file, width = 1400, height=1200) | 
|  | 734         coords = plot_supercluster(G = G, "comparative") | 
|  | 735         mtext("comparative analysis") | 
|  | 736         dev.off() | 
|  | 737         ## basename - make link relative | 
|  | 738         href = paste0("../../clusters/dir_CL",sprintf("%04d", as.numeric(V(G)$name)),"/index.html") | 
|  | 739         html_insert_image(basename(img_file), htmlfile,coords=coords, href = href) | 
|  | 740 | 
|  | 741     } | 
|  | 742 } | 
|  | 743 | 
|  | 744 html_insert_image = function(img_file, htmlfile, coords=NULL, href=NULL) { | 
|  | 745     img_tag = sprintf('<p> <img src="%s" usemap="#clustermap" > \n</p>\n<br>\n', img_file) | 
|  | 746     cat(img_tag, file = htmlfile, append = TRUE) | 
|  | 747     if (!is.null(coords)){ | 
|  | 748         formated_links = character() | 
|  | 749         for (i in seq_along(href)){ | 
|  | 750             formated_links[i] = sprintf('<area shape="circle" coords="%f,%f,%f" href="%s" >\n', | 
|  | 751                                         coords[i,1],coords[i,2],coords[,3],href[i]) | 
|  | 752         } | 
|  | 753         cat('<map name="clustermap" >\n', | 
|  | 754             formated_links,"</map>\n", file=htmlfile, append = TRUE) | 
|  | 755 | 
|  | 756     } | 
|  | 757 } | 
|  | 758 | 
|  | 759 html_insert_floating_image = function(img_file, | 
|  | 760                                       htmlfile, | 
|  | 761                                       width=NULL, | 
|  | 762                                       title= "", | 
|  | 763                                       footer = "" | 
|  | 764                                       ){ | 
|  | 765     if (is.null(width)){ | 
|  | 766         width="" | 
|  | 767     } | 
|  | 768     tag = paste( | 
|  | 769         '<div class="floating_img">\n', | 
|  | 770         '  <p>', title,'</p>\n', | 
|  | 771         '   <a href=',img_file, ">", | 
|  | 772         '    <img src="',img_file, '" alt="image" width="',width,'">\n', | 
|  | 773         '   </a>', | 
|  | 774         '  <p> ',footer,'</p>\n', | 
|  | 775         '</div>\n' | 
|  | 776        ,sep = "") | 
|  | 777     cat(tag, file = htmlfile, append = TRUE) | 
|  | 778 } | 
|  | 779 | 
|  | 780 get_supercluster_graph = function(sc, seqdb, hitsortdb, classification_hierarchy_file){ | 
|  | 781     ## sc - superclusted index | 
|  | 782     ## seqdb, hitsortdb - path to sqlite databases | 
|  | 783     ## classificationn_hierarchy_file  - path data.tree rds file | 
|  | 784     ## SIZE of image | 
|  | 785     SIZE = 1000 | 
|  | 786     connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | 
|  | 787     clusters = dbGetQuery(HITSORTDB, | 
|  | 788                           paste0("SELECT cluster, size FROM superclusters WHERE supercluster=",sc) | 
|  | 789                           ) %>% as.data.frame | 
|  | 790     ## string for sql query: | 
|  | 791     cluster_list = paste0( "(", paste0(clusters$cluster, collapse = ","),")") | 
|  | 792     supercluster_ncol = dbGetQuery(HITSORTDB, | 
|  | 793                                    paste("SELECT c1,c2,w, k FROM cluster_mate_bond where c1 IN ", | 
|  | 794                                          cluster_list, " AND c2 IN ", cluster_list, "AND k > 0.1" | 
|  | 795                                          ) | 
|  | 796                                    ) | 
|  | 797     if (nrow(supercluster_ncol)>0){ | 
|  | 798         ## at least two clusters | 
|  | 799         G = graph.data.frame(supercluster_ncol, directed=FALSE) | 
|  | 800         L = layout_with_kk(G) | 
|  | 801         ## layout is rescaled for easier linking html | 
|  | 802         L[,1] = scales::rescale(L[,1], to = c(1,SIZE) ) | 
|  | 803         L[,2] = scales::rescale(L[,2], to = c(1,SIZE) ) | 
|  | 804         G$L = L | 
|  | 805     }else{ | 
|  | 806         ## only one cluster in supercluster | 
|  | 807         G = graph.full(n = 1) | 
|  | 808         V(G)$name = as.character(clusters$cluster) | 
|  | 809         G$L = matrix(c(SIZE/2, SIZE/2), ncol=2) | 
|  | 810     } | 
|  | 811     G = set_vertex_attr(G, "label", value = paste0("CL",names(V(G)))) | 
|  | 812     G$name=sc ## names is supercluster | 
|  | 813     # create annotation of individual clusters, will be attached as node attribudes | 
|  | 814     annot = get_cluster_annotation_summary(clusters) | 
|  | 815     ## annotation of nodes(clusters) | 
|  | 816     G = set_vertex_attr(G, "annotation", value = annot$clusters[names(V(G))]) | 
|  | 817     G = set_vertex_attr(G, "size", | 
|  | 818                         value = clusters$size[match(names(V(G)), as.character(clusters$cluster))]) | 
|  | 819     G$annotation = annot$supercluster | 
|  | 820     G$clusters = clusters | 
|  | 821     # TODO if comparative analysis - add proportion of species | 
|  | 822     if (is_comparative()){ | 
|  | 823         comparative_counts = get_cluster_comparative_counts(cluster_list) | 
|  | 824         NACluster = comparative_counts$supercluster | 
|  | 825         # some clusters do not have comparative data - they are bellow threshold! | 
|  | 826         NACluster[,] = NA | 
|  | 827         counts_adjusted = comparative_counts$clusters[names(V(G))] | 
|  | 828         for (i in names(V(G))){ | 
|  | 829             if(is.null(counts_adjusted[[i]])){ | 
|  | 830                 ## get count from database | 
|  | 831                 seqid = dbGetQuery(HITSORTDB, | 
|  | 832                                    paste( | 
|  | 833                                        "SELECT vertexname FROM vertex_cluster where cluster = ", | 
|  | 834                                        i) | 
|  | 835                                    )$vertexname | 
|  | 836                 codes = get_comparative_codes()$prefix | 
|  | 837                 x = table(factor(substring(seqid, 1,nchar(codes[1])), levels = codes)) | 
|  | 838                 counts_adjusted[[i]] = data.frame( | 
|  | 839                     Freq = as.numeric(x), | 
|  | 840                     proportion = as.numeric(x/sum(x)), | 
|  | 841                     row.names = names(x)) | 
|  | 842 | 
|  | 843             } | 
|  | 844         } | 
|  | 845 | 
|  | 846         G = set_vertex_attr(G, "comparative_counts", | 
|  | 847                             value = counts_adjusted[V(G)$name] | 
|  | 848                             ) | 
|  | 849         # for whole supercluster | 
|  | 850         G$comparative = comparative_counts$supercluster | 
|  | 851     } | 
|  | 852 | 
|  | 853     return(G) | 
|  | 854 } | 
|  | 855 | 
|  | 856 get_cluster_comparative_counts = function(cluster_list){ | 
|  | 857     counts = dbGetQuery( | 
|  | 858         HITSORTDB, | 
|  | 859         paste0("SELECT * FROM comparative_counts WHERE clusterindex IN", | 
|  | 860                cluster_list | 
|  | 861                ) | 
|  | 862     ) | 
|  | 863     comparative_counts = apply(counts, 1, function(x) | 
|  | 864         y = data.frame(Freq = x[-1], proportion = x[-1]/sum(x[-1])) | 
|  | 865     ) | 
|  | 866     names(comparative_counts) = counts$clusterindex | 
|  | 867     total_counts = data.frame( | 
|  | 868         Freq = colSums(counts[,-1]), | 
|  | 869         proportion = colSums(counts[,-1])/sum(counts[,-1])) | 
|  | 870     return( | 
|  | 871         list( | 
|  | 872             clusters = comparative_counts, | 
|  | 873             supercluster = total_counts | 
|  | 874             ) | 
|  | 875     ) | 
|  | 876 | 
|  | 877 } | 
|  | 878 | 
|  | 879 get_cluster_annotation_summary = function(clusters){ | 
|  | 880     ## clusters - table of clusters, col: cluster, size | 
|  | 881     annot_list = apply(clusters,1,FUN = function(x)summarize_annotation(cluster_annotation(x[1]),x[2])) | 
|  | 882     ## if empty - not annotation at all | 
|  | 883     if (sum(sapply(annot_list, nrow)) == 0){ | 
|  | 884         return(list (clusters = NULL, | 
|  | 885                  superclusters = NULL | 
|  | 886                  )) | 
|  | 887     } | 
|  | 888     names(annot_list) = as.character(clusters$cluster) | 
|  | 889     annot_all = do.call(rbind,annot_list) | 
|  | 890     total = by(annot_all$Freq,INDICES=with(annot_all, paste(cl_string,domain)), FUN=sum, simplify=TRUE) | 
|  | 891     total_df = data.frame(Freq = c(total), proportion = c(total) /sum(clusters$size)) | 
|  | 892     ## for ploting it need to contain all annot categories as in s upercluster | 
|  | 893     annot_list_full = list() | 
|  | 894     for (i in seq_along(annot_list)){ | 
|  | 895         annot_list_full[[i]] = total_df | 
|  | 896         for (j in rownames(total_df)){ | 
|  | 897             if (!j %in% rownames(annot_list[[i]])){ | 
|  | 898                 annot_list_full[[i]][j,c('Freq','proportion')] = c(0,0) | 
|  | 899             }else{ | 
|  | 900                 annot_list_full[[i]][j,c('Freq','proportion')] = annot_list[[i]][j,c('Freq','proportion')] | 
|  | 901             } | 
|  | 902         } | 
|  | 903     } | 
|  | 904     names(annot_list_full) = names(annot_list) | 
|  | 905     return(list (clusters = annot_list_full, | 
|  | 906                  supercluster = total_df | 
|  | 907                  ) | 
|  | 908            ) | 
|  | 909 } | 
|  | 910 | 
|  | 911 plot_supercluster = function(G,color_scheme=c("annotation","comparative")){ | 
|  | 912     ## create plot in coords y: 1-1200, x: 1 - 1400 | 
|  | 913     ## this is fixed for href to work in exact image areas! | 
|  | 914     color_scheme = match.arg(color_scheme) | 
|  | 915     LIMS0 = matrix(c (1,1000,1,1000), ncol = 2) | 
|  | 916     OFFSET_H = 100; OFFSET_W = 200 | 
|  | 917     lims = LIMS0 + c(0,OFFSET_W * 2, 0, OFFSET_H * 2) | 
|  | 918     ## use full range | 
|  | 919     par(mar=c(0,0,0,0),xaxs="i", yaxs="i") | 
|  | 920     ## proportion of repeats | 
|  | 921     if (color_scheme == "annotation"){ | 
|  | 922         prop = sapply (V(G)$annotation, FUN = function(x)x$proportion) | 
|  | 923         brv = c("#BBBBBB",unique_colors) | 
|  | 924     }else{ | 
|  | 925         prop = sapply (V(G)$comparative_counts, FUN = function(x)x$proportion) | 
|  | 926         brv = unique_colors | 
|  | 927     } | 
|  | 928     ## handle special cases - single cluster with annot or single annotation group | 
|  | 929     if (is.null(dim(prop))){ | 
|  | 930         prop=matrix(prop, nrow = 1) | 
|  | 931     } | 
|  | 932     if (length(prop)==0){ | 
|  | 933         ## special case - not annotation at all | 
|  | 934         prop = matrix(rep(1,vcount(G)),ncol = vcount(G), dimnames=list("NAN", NULL)) | 
|  | 935     }else{ | 
|  | 936         if (color_scheme == "annotation"){ | 
|  | 937             rownames(prop) = rownames(G$annotation) | 
|  | 938         }else{ | 
|  | 939             rownames(prop) = rownames(G$comparative) | 
|  | 940         } | 
|  | 941         ## reorder by prop | 
|  | 942         if (color_scheme =="annotation"){ | 
|  | 943             prop = prop[order(prop[,1], decreasing = TRUE),,drop=FALSE] | 
|  | 944             NAN = 1 - colSums(prop) | 
|  | 945             prop = rbind(NAN, prop) | 
|  | 946             brv = c("#BBBBBB",unique_colors) | 
|  | 947         }else{ | 
|  | 948         } | 
|  | 949     } | 
|  | 950     L = G$L  + c(OFFSET_H) | 
|  | 951   ## for href - necessary convert coordinater - image is flipped vertically | 
|  | 952     include = rowSums(prop>0.001, na.rm = TRUE)>=1 | 
|  | 953     include[1] = TRUE | 
|  | 954     prop = prop[include, , drop=FALSE] | 
|  | 955     plot(0,0,type='n', , xlim = lims[,1], ylim = lims[,2]) | 
|  | 956     plot_edges(G, L, lwd = 8) | 
|  | 957     RADIUS = radius_size(G) | 
|  | 958     coords = cbind(G$L[,1,drop=FALSE] + 100, 1100 - G$L[,2,drop=FALSE], RADIUS) | 
|  | 959     pieScatter(L[,1], L[,2], t(prop), col = brv, | 
|  | 960                radius = RADIUS, new.plot = FALSE, border=NA) | 
|  | 961     ## add ledend | 
|  | 962     legend("topright", legend = rownames(prop), | 
|  | 963            col = brv, pch = 15, cex= 1.5, pt.cex= 3) | 
|  | 964     ## add cluster labels | 
|  | 965     text(x = L[,1], y = L[,2], labels = paste(V(G)$label,"\n",V(G)$size)) | 
|  | 966     return(coords) | 
|  | 967 } | 
|  | 968 | 
|  | 969 | 
|  | 970 radius_size = function(G){ | 
|  | 971     if (vcount(G) == 1) RADIUS=120 | 
|  | 972     if (vcount(G) == 2) RADIUS=50 | 
|  | 973     if (vcount(G) %in% 3:8) RADIUS=40 | 
|  | 974     if (vcount(G) > 8) RADIUS=30 | 
|  | 975     return(RADIUS) | 
|  | 976 } | 
|  | 977 | 
|  | 978 | 
|  | 979 plot_edges = function(G,L,col="#33000040", lwd = 1){ | 
|  | 980 	e = get.edgelist(G, names = F) | 
|  | 981 	X0 = L[e[, 1], 1] | 
|  | 982 	Y0 = L[e[, 1], 2] | 
|  | 983 	X1 = L[e[, 2], 1] | 
|  | 984 	Y1 = L[e[, 2], 2] | 
|  | 985 	segments(X0, Y0, X1, Y1, lwd = lwd,col = col) | 
|  | 986 } | 
|  | 987 | 
|  | 988 | 
|  | 989 pieScatter=function(x, y, prop,radius=1, col=NULL, edges=100, new.plot = TRUE, ...){ | 
|  | 990     if (new.plot){ | 
|  | 991         plot(x,y,type='n') | 
|  | 992     } | 
|  | 993     for (i in seq_along(x)){ | 
|  | 994         p=prop[i,] | 
|  | 995         if (length(radius)==1){ | 
|  | 996             r=radius | 
|  | 997         }else{ | 
|  | 998             r=radius[i] | 
|  | 999         } | 
|  | 1000         include = p != 0 | 
|  | 1001         floating.pie(x[i], y[i],p[include], | 
|  | 1002                      col=col[include], radius=r,edges=50,...) | 
|  | 1003     } | 
|  | 1004 } | 
|  | 1005 | 
|  | 1006 unique_colors = c("#00FF00", "#0000FF", "#FF0000", | 
|  | 1007                   "#FFA6FE", "#FFDB66", "#006401", "#010067", "#95003A", | 
|  | 1008                   "#007DB5", "#FF00F6", "#FFEEE8", "#774D00", "#90FB92", "#01FFFE", | 
|  | 1009                   "#0076FF", "#D5FF00", "#FF937E", "#6A826C", "#FF029D", | 
|  | 1010                   "#FE8900", "#7A4782", "#7E2DD2", "#85A900", "#FF0056", | 
|  | 1011                   "#A42400", "#00AE7E", "#683D3B", "#BDC6FF", "#263400", | 
|  | 1012                   "#BDD393", "#00B917", "#9E008E", "#001544", "#C28C9F", | 
|  | 1013                   "#FF74A3", "#01D0FF", "#004754", "#E56FFE", "#788231", | 
|  | 1014                   "#0E4CA1", "#91D0CB", "#BE9970", "#968AE8", "#BB8800", | 
|  | 1015                   "#43002C", "#DEFF74", "#00FFC6", "#FFE502", "#620E00", | 
|  | 1016                   "#008F9C", "#98FF52", "#7544B1", "#B500FF", "#00FF78", | 
|  | 1017                   "#FF6E41", "#005F39", "#6B6882", "#5FAD4E", "#A75740", | 
|  | 1018                   "#A5FFD2", "#FFB167", "#009BFF", "#E85EBE") | 
|  | 1019 | 
|  | 1020 if (FALSE){ | 
|  | 1021 ## For testing | 
|  | 1022     for ( i in 1:100){ | 
|  | 1023         G = get_supercluster_graph( | 
|  | 1024             sc=i, seqdb = seqdb, | 
|  | 1025             hitsortdb = hitsortdb, classification_hierarchy_file = class_file | 
|  | 1026         ) | 
|  | 1027         png(paste0("/mnt/raid/users/petr/tmp/test.figure_",i,".png"), width = 1400, height=1000) | 
|  | 1028         plot_supercluster(G) | 
|  | 1029         dev.off() | 
|  | 1030         print(i) | 
|  | 1031     } | 
|  | 1032 | 
|  | 1033 } | 
|  | 1034 | 
|  | 1035 plotg = function(GG,LL,wlim=NULL,...){ | 
|  | 1036     ## quick ploting function | 
|  | 1037     e = get.edgelist(GG, names=F) | 
|  | 1038     w = E(GG)$weight | 
|  | 1039     if (!is.null(wlim)) {e = e[w > wlim,]; w = w[w > wlim]} | 
|  | 1040     X0 = LL[e[,1],1] | 
|  | 1041     Y0 = LL[e[,1],2] | 
|  | 1042     X1 = LL[e[,2],1] | 
|  | 1043     Y1 = LL[e[,2],2] | 
|  | 1044     plot(range(LL[,1]),range(LL[,2]),xlab="",ylab="",axes=FALSE,type="n",...) | 
|  | 1045     brv = 'grey' | 
|  | 1046     segments(X0,Y0,X1,Y1,lwd=.5,col=brv) | 
|  | 1047     points(LL,pch=18,cex=.8,...) | 
|  | 1048 } | 
|  | 1049 | 
|  | 1050 | 
|  | 1051 | 
|  | 1052 | 
|  | 1053 get_cluster_info = function(index){ | 
|  | 1054     ## must be connected to hitsort database ( HITSORT) | 
|  | 1055     x = dbGetQuery(HITSORTDB, | 
|  | 1056                    paste0("SELECT * FROM cluster_info WHERE [index]=",index)) | 
|  | 1057     return(as.list(x)) | 
|  | 1058 } | 
|  | 1059 | 
|  | 1060 get_supercluster_info = function(sc){ | 
|  | 1061     ## must be connected to hitsort database ( HITSORT) | 
|  | 1062     x = dbGetQuery(HITSORTDB, | 
|  | 1063                    paste0("SELECT * FROM cluster_info WHERE supercluster=",sc)) | 
|  | 1064     return(x) | 
|  | 1065 } | 
|  | 1066 | 
|  | 1067 get_supercluster_summary = function(sc){ | 
|  | 1068     info = get_supercluster_info(sc) | 
|  | 1069     Nclusters = nrow(info) | 
|  | 1070     Nreads = supercluster_size(sc) | 
|  | 1071     N = dbGetQuery(SEQDB, "SELECT count(*) from sequences") | 
|  | 1072     proportion = Nreads/N | 
|  | 1073     cluster_list = info$index | 
|  | 1074     return(list( | 
|  | 1075         Nreads = Nreads, | 
|  | 1076         Nclusters = Nclusters, | 
|  | 1077         proportion = proportion, | 
|  | 1078         cluster_list = cluster_list | 
|  | 1079     )) | 
|  | 1080 } | 
|  | 1081 | 
|  | 1082 | 
|  | 1083 get_cluster_connection_info = function(index, search_type=c("pair", "similarity")){ | 
|  | 1084     search_type = match.arg(search_type) | 
|  | 1085     if (search_type == "pair"){ | 
|  | 1086         x = dbGetQuery(HITSORTDB, | 
|  | 1087                        sprintf( | 
|  | 1088                            "SELECT * FROM cluster_mate_connections WHERE c1==%d OR c2=%d", | 
|  | 1089                            index, index) | 
|  | 1090                        ) | 
|  | 1091         if (nrow(x) == 0 ){ | 
|  | 1092             ## no connections | 
|  | 1093             return(NULL) | 
|  | 1094         } | 
|  | 1095         other_cl = ifelse(x$c1 == index, x$c2, x$c1) | 
|  | 1096         out = data.frame(cl = other_cl, N = x$N) | 
|  | 1097         out$ids = unlist(strsplit(x$ids, split = ", ")) | 
|  | 1098         k = dbGetQuery(HITSORTDB, | 
|  | 1099                        sprintf( | 
|  | 1100                            "SELECT * FROM cluster_mate_bond WHERE c1==%d OR c2=%d", | 
|  | 1101                            index, index) | 
|  | 1102                        ) | 
|  | 1103         if (k$c1 !=  x$c1 || k$c2 !=x$c2){ | 
|  | 1104             ## tables not in same order | 
|  | 1105             stop("tables cluster_mate_bond and cluster_mate_connection are not in the same order") | 
|  | 1106         } | 
|  | 1107         out$k = k$k | 
|  | 1108         return(out) | 
|  | 1109     }else{ | 
|  | 1110         x = dbGetQuery(HITSORTDB, | 
|  | 1111                        sprintf( | 
|  | 1112                            "SELECT * FROM cluster_connections WHERE c1==%d OR c2=%d", | 
|  | 1113                            index, index) | 
|  | 1114                        ) | 
|  | 1115         if (nrow(x) == 0 ){ | 
|  | 1116             ## no connections | 
|  | 1117             return(NULL) | 
|  | 1118         } | 
|  | 1119         other_cl = ifelse(x$c1 == index, x$c2, x$c1) | 
|  | 1120         out = data.frame(cl = other_cl, N = x$"count(*)") | 
|  | 1121         return(out) | 
|  | 1122     } | 
|  | 1123 } | 
|  | 1124 | 
|  | 1125 | 
|  | 1126 format_clinfo=function(x){ | 
|  | 1127     include = c( | 
|  | 1128         "size", | 
|  | 1129         "size_real", | 
|  | 1130         "ecount", | 
|  | 1131         "supercluster", | 
|  | 1132         "annotations_summary", | 
|  | 1133         "ltr_detection", | 
|  | 1134         "pair_completeness", | 
|  | 1135         "TR_score", | 
|  | 1136         "TR_monomer_length", | 
|  | 1137         "loop_index", | 
|  | 1138         "satellite_probability", | 
|  | 1139         "TR_consensus", | 
|  | 1140         "tandem_rank", | 
|  | 1141         "orientation_score" | 
|  | 1142     ) | 
|  | 1143     new_names = c( | 
|  | 1144       "TR_consensus" = "TAREAN consensus", | 
|  | 1145       "tandem_rank" = "TAREAN_annotation", | 
|  | 1146       "ltr_detection" = "PBS/LTR", | 
|  | 1147       'size' = 'number of reads in the graph', | 
|  | 1148       'size_real' = 'the total number of reads in the cluster', | 
|  | 1149       'ecount' = "number of edges of the graph:", | 
|  | 1150       "annotations_summary" = "similarity based annotation" | 
|  | 1151     ) | 
|  | 1152     values = unlist(x[include]) %>% gsub("\n","<br>", .) | 
|  | 1153     include = replace(include, match(names(new_names),include), new_names) | 
|  | 1154     names(values) = include | 
|  | 1155     ## replace tandem rank with annotation description | 
|  | 1156     values["TAREAN_annotation"] = RANKS_TANDEM[values["TAREAN_annotation"]] | 
|  | 1157     ## create HTML table without header | 
|  | 1158     desc = df2html(data.frame(names(values), values)) | 
|  | 1159     return(desc) | 
|  | 1160 } | 
|  | 1161 | 
|  | 1162 create_cluster_report = function(index, | 
|  | 1163                                  seqdb, hitsortdb, | 
|  | 1164                                  classification_hierarchy_file, | 
|  | 1165                                  HTML_LINKS){ | 
|  | 1166     ## index - index of cluster, integer | 
|  | 1167     ## seqdb, hitsortdb - sqlite databases | 
|  | 1168     ## classification_hierarchy_file - rds file with classification | 
|  | 1169     ## other information about cluster is collected from HITSORTDB | 
|  | 1170     connect_to_databases(seqdb, hitsortdb, classification_hierarchy_file) | 
|  | 1171     ## basic info about cluster | 
|  | 1172     clinfo = get_cluster_info(index) | 
|  | 1173     HTML_LINKS = nested2named_list(HTML_LINKS) | 
|  | 1174     PNGWIDTH = 1000  # this must be kept so than link in png work correctly | 
|  | 1175     PNGHEIGHT = 1000 | 
|  | 1176     LEGENDWIDTH = 300 | 
|  | 1177     PS = 30  ## pointsize for plotting | 
|  | 1178     MAX_N = 10 ## to show mates or similarities connections | 
|  | 1179 | 
|  | 1180     ################################################################################# | 
|  | 1181     ## create HTML title: | 
|  | 1182     title = paste("Cluster no. ",index) | 
|  | 1183     htmlheader = gsub("PAGE_TITLE", title, HTMLHEADER) | 
|  | 1184     cat(htmlheader, file = clinfo$html_report_main) | 
|  | 1185     file.copy(paste0(WD,"/style1.css"), clinfo$dir) | 
|  | 1186     HTML.title(title, file = clinfo$html_report_main) | 
|  | 1187     ## print basic info to HTML header: | 
|  | 1188     ## make link back to cluster table | 
|  | 1189     hwrite("Go back to cluster table <br><br>\n", link=HTML_LINKS$CLUSTER_TO_CLUSTER_TABLE) %>% | 
|  | 1190         cat(file = clinfo$html_report_main, append =TRUE) | 
|  | 1191     ## create ling to corresponding supercluster | 
|  | 1192     cat("Cluster is part of ", | 
|  | 1193         hwrite (paste("supercluster: ", clinfo$supercluster), | 
|  | 1194                 link=sprintf(HTML_LINKS$CLUSTER_TO_SUPERCLUSTER, clinfo$supercluster)), | 
|  | 1195         "\n<br><br>\n", | 
|  | 1196         file = clinfo$html_report_main, append =TRUE) | 
|  | 1197     HTML.title("Cluster characteristics:", HR = 3, file = clinfo$html_report_main) | 
|  | 1198     cat(format_clinfo(clinfo), file = clinfo$html_report_main, append =TRUE) | 
|  | 1199     ################################################################################# | 
|  | 1200     ## add link to image with graph layout with domains | 
|  | 1201     fs = list( | 
|  | 1202         graph_domains = paste0(clinfo$html_report_files,"/graph_domains.png"), | 
|  | 1203         graph_mates = paste0(clinfo$html_report_files,"/graph_mates%04d_%04d.png"), | 
|  | 1204         graph_base = paste0(clinfo$html_report_files,"/graph_base.png"), | 
|  | 1205         graph_comparative = paste0(clinfo$html_report_files,"/graph_comparative.png") | 
|  | 1206     ) | 
|  | 1207     fs_relative = list( | 
|  | 1208         graph_domains = paste0(basename(clinfo$html_report_files),"/graph_domains.png"), | 
|  | 1209         graph_comparative = paste0(basename(clinfo$html_report_files),"/graph_comparative.png"), | 
|  | 1210         graph_mates = paste0(basename(clinfo$html_report_files),"/graph_mates%04d_%04d.png") | 
|  | 1211     ) | 
|  | 1212 | 
|  | 1213     dir.create(clinfo$html_report_files) | 
|  | 1214     load(clinfo$graph_file) | 
|  | 1215     annot = cluster_annotation(index) | 
|  | 1216     annot_summary = summarize_annotation(annot, clinfo$size_real) | 
|  | 1217     vertex_colors = annot2colors(annot,ids = V(GL$G)$name) | 
|  | 1218 | 
|  | 1219     color_proportions = sort(table(vertex_colors$color_table)) / clinfo$size_real | 
|  | 1220     ## hits with smaller proportion are not shown! | 
|  | 1221     exclude_colors = names(color_proportions)[color_proportions < 0.001] | 
|  | 1222     vertex_colors$color_table[vertex_colors$color_table %in% exclude_colors] = "#00000050" | 
|  | 1223     vertex_colors$legend = vertex_colors$legend[!vertex_colors$legend %in% exclude_colors] | 
|  | 1224 | 
|  | 1225     if (is_comparative()){ | 
|  | 1226         comp_codes = get_comparative_codes()$prefix | 
|  | 1227         colors = unique_colors[seq_along(comp_codes)] | 
|  | 1228         names(colors) = comp_codes | 
|  | 1229         species = substring(V(GL$G)$name,1, nchar(comp_codes[1])) | 
|  | 1230         ## create image with highlighted domains | 
|  | 1231         png(fs$graph_comparative, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS) | 
|  | 1232         layout(matrix(1:2, ncol = 2), width = c(10, 3)) | 
|  | 1233         plotg(GL$G,GL$L, col = colors[species]) | 
|  | 1234         par(mar = c(0, 0, 0, 0)) | 
|  | 1235         plot.new() | 
|  | 1236         legend("topleft", col=colors, | 
|  | 1237                    legend = names(colors), | 
|  | 1238                    pch = 15, cex = 0.7) | 
|  | 1239         dev.off() | 
|  | 1240         HTML.title("comparative analysis:", HR=4, file = clinfo$html_report_main) | 
|  | 1241         html_insert_image( | 
|  | 1242             img_file = fs_relative$graph_comparative, | 
|  | 1243             htmlfile = clinfo$html_report_main) | 
|  | 1244         ## comparative analysis - calculate statistics: | 
|  | 1245         V1V2 = substring(get.edgelist(GL$G),1, nchar(comp_codes[1])) | 
|  | 1246         C_observed = table(data.frame( | 
|  | 1247             V1=factor(V1V2[,1],levels = comp_codes), | 
|  | 1248             V2=factor(V1V2[,2],levels = comp_codes) | 
|  | 1249         )) | 
|  | 1250         C_observed = as.data.frame.matrix(C_observed + t(C_observed)) | 
|  | 1251         P_observed = as.data.frame.matrix(C_observed/sum(C_observed)) | 
|  | 1252         Edge_propotions = table(factor(V1V2, levels = comp_codes))/(length(V1V2)) | 
|  | 1253         P_expected = matrix(Edge_propotions,ncol=1) %*% matrix(Edge_propotions, nrow=1) | 
|  | 1254 | 
|  | 1255         spec_counts = df2html( | 
|  | 1256             data.frame(table(factor(species, levels=comp_codes))), | 
|  | 1257             header =  c("Species", "Read count") | 
|  | 1258         ) | 
|  | 1259 | 
|  | 1260         edge_count = df2html( | 
|  | 1261             data.frame(species = comp_codes, (C_observed/2)[,]), | 
|  | 1262             header = c("", comp_codes) | 
|  | 1263         ) | 
|  | 1264 | 
|  | 1265         P_OE = df2html( | 
|  | 1266             data.frame(species=comp_codes,(P_observed/P_expected[,])), | 
|  | 1267             header = c("", comp_codes) | 
|  | 1268         ) | 
|  | 1269 | 
|  | 1270         HTML.title("Comparative analysis - species read counts:", HR =5, file = clinfo$html_report_main) | 
|  | 1271         cat(spec_counts, | 
|  | 1272             file = clinfo$html_report_main, append =TRUE) | 
|  | 1273 | 
|  | 1274         # show connection only if more than one species in cluster | 
|  | 1275         if (length(unique(species))>1){ | 
|  | 1276             HTML.title("comparative analysis - number of edges between species:", | 
|  | 1277                        HR =5, file = clinfo$html_report_main) | 
|  | 1278             cat(edge_count, | 
|  | 1279                 file = clinfo$html_report_main, append = TRUE) | 
|  | 1280 | 
|  | 1281             HTML.title("comparative analysis - observed/expected number of edges between species", | 
|  | 1282                        HR =5, file = clinfo$html_report_main) | 
|  | 1283             cat(P_OE, | 
|  | 1284                 file = clinfo$html_report_main, append = TRUE) | 
|  | 1285         } | 
|  | 1286 | 
|  | 1287     } | 
|  | 1288 | 
|  | 1289 | 
|  | 1290     ## create image with highlighted domains | 
|  | 1291     png(fs$graph_domains, width = PNGWIDTH + LEGENDWIDTH, height = PNGHEIGHT, pointsize = PS) | 
|  | 1292     layout(matrix(1:2, ncol = 2), width = c(10, 3)) | 
|  | 1293     plotg(GL$G,GL$L, col = vertex_colors$color_table) | 
|  | 1294     domains_detected = length(vertex_colors$legend) > 0 | 
|  | 1295     par(mar = c(0, 0, 0, 0)) | 
|  | 1296     plot.new() | 
|  | 1297     if (domains_detected){ | 
|  | 1298         # domains found | 
|  | 1299         legend("topleft", col=vertex_colors$legend, | 
|  | 1300                legend = names(vertex_colors$legend), | 
|  | 1301                pch = 15, cex = 0.7) | 
|  | 1302     } | 
|  | 1303     dev.off() | 
|  | 1304 | 
|  | 1305     HTML.title("protein domains:", HR=4, file = clinfo$html_report_main) | 
|  | 1306     if (!domains_detected){ | 
|  | 1307         HTML("No protein domains detected", file = clinfo$html_report_main) | 
|  | 1308     } | 
|  | 1309     HTML("protein domains:", HR=4, file = clinfo$html_report_main) | 
|  | 1310     html_insert_image( | 
|  | 1311         img_file = fs_relative$graph_domains, | 
|  | 1312         htmlfile = clinfo$html_report_main) | 
|  | 1313 | 
|  | 1314     ############################################################################# | 
|  | 1315     if (nrow(annot_summary) == 0){ | 
|  | 1316     HTML.title("Reads annotation summary", HR = 3, file = clinfo$html_report_main) | 
|  | 1317         HTML("No similarity hits to repeat databases found", file = clinfo$html_report_main) | 
|  | 1318     }else{ | 
|  | 1319         HTML(annot_summary, file = clinfo$html_report_main, align = "left") | 
|  | 1320     } | 
|  | 1321 | 
|  | 1322 | 
|  | 1323     ## similarity and mate cluster | 
|  | 1324     mate_clusters = get_cluster_connection_info(index, search_type="pair") | 
|  | 1325     similar_clusters = get_cluster_connection_info(index, search_type="similarity") | 
|  | 1326     ## report mate and similarity clusters | 
|  | 1327     if (!is.null(similar_clusters)){ | 
|  | 1328         HTML.title("clusters with similarity:", file =clinfo$html_report_main, HR = 3) | 
|  | 1329         cat(df2html( | 
|  | 1330             similar_clusters, | 
|  | 1331             header=c("Cluster","Number of similarity hits"), | 
|  | 1332             sort_col = "N", scroling = TRUE | 
|  | 1333             ), | 
|  | 1334         file =clinfo$html_report_main, append=TRUE) | 
|  | 1335     } | 
|  | 1336     if (!is.null(mate_clusters)){ | 
|  | 1337         HTML.title("clusters connected through mates:", file =clinfo$html_report_main, HR = 3) | 
|  | 1338         cat(df2html( | 
|  | 1339             mate_clusters[,c('cl','N','k')], | 
|  | 1340             header = c('Cluster','Number of shared<br> read pairs','k'), | 
|  | 1341             sort_col = "N", scroling = TRUE | 
|  | 1342             ), | 
|  | 1343         file = clinfo$html_report_main,append = TRUE | 
|  | 1344         ) | 
|  | 1345 | 
|  | 1346         ## create base graph images - it will serve as background for | 
|  | 1347         ## mate clusters plots | 
|  | 1348         png(fs$graph_base, | 
|  | 1349             width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS) | 
|  | 1350         par(mar=c(0,0,0,0),xaxs="i",yaxs="i") | 
|  | 1351         plotg(GL$G,GL$L, col = "#00000050") | 
|  | 1352         dev.off() | 
|  | 1353         ## load base as raster image | 
|  | 1354         base_image = readPNG(fs$graph_base) | 
|  | 1355 | 
|  | 1356         for (i in order(mate_clusters$N, decreasing = TRUE)){ | 
|  | 1357             mate_ids = unlist(strsplit(mate_clusters$ids[[i]],split=",")) | 
|  | 1358             ## print only graph above MAX_N mates | 
|  | 1359             if (length(mate_ids) < MAX_N){  # TODO  - use constant | 
|  | 1360                 next | 
|  | 1361             } | 
|  | 1362             png(sprintf(fs$graph_mates,index, mate_clusters$cl[i]), | 
|  | 1363                 width = PNGWIDTH, height = PNGHEIGHT, pointsize = PS) | 
|  | 1364             color_mate = gsub(".$","",V(GL$G)$name) %in% mate_ids %>% | 
|  | 1365                 ifelse("#FF0000FF", "#000000AA") | 
|  | 1366             par(mar=c(0,0,0,0),xaxs="i",yaxs="i") | 
|  | 1367             plot(range(GL$L[,1]), range(GL$L[,2]), type = "n", xlab = "", ylab = "", axes = FALSE, | 
|  | 1368                  main = paste0("CL",index," ----> CL",mate_clusters$cl[i])) | 
|  | 1369             rasterImage(base_image, | 
|  | 1370                         range(GL$L[,1])[1], range(GL$L[,2])[1], | 
|  | 1371                         range(GL$L[,1])[2], range(GL$L[,2])[2] | 
|  | 1372                         ) | 
|  | 1373             points(GL$L[,1:2], col = color_mate,pch=18,cex=.8) | 
|  | 1374             dev.off() | 
|  | 1375             title = paste0("CL",index," ----> CL",mate_clusters$cl[i]) | 
|  | 1376             footer = paste0("No. of shared pairs: :", mate_clusters$N[i]) | 
|  | 1377             html_insert_floating_image( | 
|  | 1378                 img_file = sprintf(fs_relative$graph_mates, index, mate_clusters$cl[i]), | 
|  | 1379                 htmlfile = clinfo$html_report_main, width = 200, | 
|  | 1380                 title = title, footer = footer | 
|  | 1381             ) | 
|  | 1382         } | 
|  | 1383     } | 
|  | 1384 } |