annotate clean_rm_output.R @ 0:d14182506989 draft default tip

"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author petrn
date Tue, 15 Feb 2022 16:44:31 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
1 #!/usr/bin/env Rscript
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
2 suppressPackageStartupMessages(library(rtracklayer))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
3 suppressPackageStartupMessages(library(parallel))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
4
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
5
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
6 gff_cleanup = function(gff){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
7 ## remove overlapin annotation track - assign new annot
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
8 gff_disjoin = disjoin(gff, with.revmap=TRUE)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
9 ## append annotation:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
10 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
11 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
12 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
13 new_annot_uniq = unique(new_annot)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
15 names(lca_annot) = new_annot_uniq
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
16 new_annot_lca = lca_annot[new_annot]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
19 gff_disjoin$strands=strand_attribute
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
20 gff_disjoin$source="RM"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
21 gff_disjoin$type="repeat"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
22 gff_disjoin$score=NA
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
23 gff_disjoin$phase=NA
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
24 gff_disjoin$Name=new_annot_lca
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
25 gff_disjoin$Original_names=new_annot
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
26 gff_disjoin$revmap=NULL
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
27 return(gff_disjoin)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
28 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
29
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
30 resolve_name=function(x){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
31 if (length(x)==1){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
32 # no conflict
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
33 return(x)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
34 } else{
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
35 y = sapply(x, strsplit, split="/", fixed = TRUE)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
36 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x))))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
37 if (max(ny)<length(x)){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
38 return("Unknown")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
39 }else{
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
40 k = which(ny==length(x))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
41 r = max(as.numeric((gsub(" .+","",names(k)))))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
42 out = paste(y[[1]][1:r], collapse="/")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
43 return(out)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
44 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
45 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
46 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
47
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
48 convert_names <- function(n, old_sep = "|" , new_sep = "\""){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
49 # remove all characters which are new_sep with -
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
50 n_new = gsub(old_sep, new_sep,
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
51 gsub(new_sep,"-", n, fixed = TRUE),
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
52 fixed = TRUE)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
53 return(n_new)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
54 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
55
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
56
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
57 infile = commandArgs(T)[1]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
58 outfile = commandArgs(T)[2]
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
59
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
60
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
61 ## infile = "./test_data/raw_rm.out"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
62
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
63 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
64
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
65 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
66
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
67 # repeat class after # symbol - syntax 1
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
68 # detect separator
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
69 # if "|" is present replace "|" -> "/" and "/" -> "-"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
70 if (any(grep("|", rm_out$V11))){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
71 gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/")
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
72 message('replacing classification separator character "|" with "/"')
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
73 print(gff)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
74 }else{
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
75 gff$Name <- rm_out$V11
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
76 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
77
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
78 ## is repeat type is specifies by double underscore:
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
79 ## then rm_out$V11 is unspecified
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
80 if (any(rm_out$V11 == "Unspecified")){
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
81 ## set Name from prefix
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
82 ## TODO
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
83 inc = rm_out$V11 == "Unspecified"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
84 Name = gsub("__.+","",rm_out$V10)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
85 # chanche Usnpsecified to new name
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
86 gff$Name[inc] = Name
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
87 }
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
88
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
89
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
90 ## join neighbors with the same annotation, disregard strand!
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
91 result <- unlist(reduce(split(gff, gff$Name)))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
92
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
93 result$Name <- names(result)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
94
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
95 result_clean = gff_cleanup(result)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
96
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
97 ## TODO
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
98 ## identify conflicting annotation, replace by LCA but keep origin list of classifications
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
99
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
100 gff_out = sortSeqlevels(result_clean)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
101 gff_out = sort(gff_out)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
102 gff_out$type = "repeat_region"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
103 gff_out$source = "RepeatMasker_parsed"
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
104 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name))
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
105 export(gff_out, format = "gff3", con=outfile)
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
106
d14182506989 "planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff changeset
107