Mercurial > repos > petrn > repeat_annotation_pipeline
annotate annotate_contigs.R @ 0:d14182506989 draft default tip
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author | petrn |
---|---|
date | Tue, 15 Feb 2022 16:44:31 +0000 |
parents | |
children |
rev | line source |
---|---|
0
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
2 ## input 1 - fasta with CLXContig names |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
3 ## input 2 - annotation |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
4 ## output 3 - annotated fasta |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
5 suppressPackageStartupMessages(library(Biostrings)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
6 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
7 clean_contigs = function(s) { |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
8 ## remove all N |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
9 sc = as.character(s) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
10 sc_trimmed = gsub("N+$", "", gsub("^N+", "", s)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
11 ## remove blank and short sequences: |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
12 sc_trimmed_not_empty = sc_trimmed[nchar(sc_trimmed) != 0] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
13 sc_trimmed_short = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) <= 20] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
14 sc_trimmed_long = sc_trimmed_not_empty[nchar(sc_trimmed_not_empty) > 20] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
15 sc_trimmed_short_tarean = sc_trimmed_short[grep("sc_", names(sc_trimmed_short), fixed = TRUE)] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
16 sc_out = DNAStringSet(c(sc_trimmed_long, sc_trimmed_short_tarean)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
17 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
18 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
19 ## annotate_rm_fasta.R input.fasta annot.csv output.fasta |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
20 ## input 1 - input.fasta - contigs from clustering |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
21 ## input 2 - annot.csv of clusters, firts column is CL number, seciond is annotation |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
22 ## |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
23 ## output - clean conntigs with appended annotation |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
24 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
25 ## find header row of annotation table |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
26 x = readLines(commandArgs(T)[2]) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
27 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
28 ## TODO - check mandatory names!!! |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
29 hl = intersect(grep("cluster", tolower(x)), grep("automatic_annotation", tolower(x))) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
30 message("using line ", hl, " as header") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
31 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
32 annotation_table = read.table(commandArgs(T)[2], sep = "\t", header = TRUE, skip = hl - 1) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
33 colnames(annotation_table) = tolower(colnames(annotation_table)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
34 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
35 contigs = readDNAStringSet(commandArgs(T)[1]) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
36 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
37 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
38 if ("final_annotation" %in% colnames(annotation_table) & all(!is.na(annotation_table$final_annotation))) { |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
39 annot_dict = annotation_table$final_annotation |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
40 message("using final annotation column") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
41 }else { |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
42 message("using automatic annotation column") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
43 annot_dict = annotation_table$automatic_annotation |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
44 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
45 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
46 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
47 names(annot_dict) = paste0("CL", annotation_table$cluster) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
48 #print(annot_dict) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
49 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
50 contigs_ok = clean_contigs(contigs) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
51 contig_name = gsub("Contig.+", "", names(contigs_ok)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
52 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
53 ## keep only contigs which are in annot table |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
54 include = contig_name %in% names(annot_dict) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
55 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
56 contig_name_inc = contig_name[include] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
57 contig_ok_include = contigs_ok[include] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
58 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
59 new_name_with_annot = paste0(names(contig_ok_include), "#", annot_dict[contig_name_inc]) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
60 names(contig_ok_include) = new_name_with_annot |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
61 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
62 writeXStringSet(contig_ok_include, filepath = commandArgs(T)[3]) |