Mercurial > repos > petrn > repeat_annotation_pipeline
annotate clean_rm_output.R @ 0:d14182506989 draft default tip
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
author | petrn |
---|---|
date | Tue, 15 Feb 2022 16:44:31 +0000 |
parents | |
children |
rev | line source |
---|---|
0
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
2 suppressPackageStartupMessages(library(rtracklayer)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
3 suppressPackageStartupMessages(library(parallel)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
4 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
5 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
6 gff_cleanup = function(gff){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
7 ## remove overlapin annotation track - assign new annot |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
8 gff_disjoin = disjoin(gff, with.revmap=TRUE) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
9 ## append annotation: |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
10 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
11 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
12 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
13 new_annot_uniq = unique(new_annot) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
15 names(lca_annot) = new_annot_uniq |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
16 new_annot_lca = lca_annot[new_annot] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
19 gff_disjoin$strands=strand_attribute |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
20 gff_disjoin$source="RM" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
21 gff_disjoin$type="repeat" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
22 gff_disjoin$score=NA |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
23 gff_disjoin$phase=NA |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
24 gff_disjoin$Name=new_annot_lca |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
25 gff_disjoin$Original_names=new_annot |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
26 gff_disjoin$revmap=NULL |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
27 return(gff_disjoin) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
28 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
29 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
30 resolve_name=function(x){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
31 if (length(x)==1){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
32 # no conflict |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
33 return(x) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
34 } else{ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
35 y = sapply(x, strsplit, split="/", fixed = TRUE) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
36 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x)))) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
37 if (max(ny)<length(x)){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
38 return("Unknown") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
39 }else{ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
40 k = which(ny==length(x)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
41 r = max(as.numeric((gsub(" .+","",names(k))))) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
42 out = paste(y[[1]][1:r], collapse="/") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
43 return(out) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
44 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
45 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
46 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
47 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
48 convert_names <- function(n, old_sep = "|" , new_sep = "\""){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
49 # remove all characters which are new_sep with - |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
50 n_new = gsub(old_sep, new_sep, |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
51 gsub(new_sep,"-", n, fixed = TRUE), |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
52 fixed = TRUE) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
53 return(n_new) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
54 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
55 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
56 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
57 infile = commandArgs(T)[1] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
58 outfile = commandArgs(T)[2] |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
59 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
60 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
61 ## infile = "./test_data/raw_rm.out" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
62 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
63 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
64 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
65 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
66 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
67 # repeat class after # symbol - syntax 1 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
68 # detect separator |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
69 # if "|" is present replace "|" -> "/" and "/" -> "-" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
70 if (any(grep("|", rm_out$V11))){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
71 gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/") |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
72 message('replacing classification separator character "|" with "/"') |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
73 print(gff) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
74 }else{ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
75 gff$Name <- rm_out$V11 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
76 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
77 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
78 ## is repeat type is specifies by double underscore: |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
79 ## then rm_out$V11 is unspecified |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
80 if (any(rm_out$V11 == "Unspecified")){ |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
81 ## set Name from prefix |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
82 ## TODO |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
83 inc = rm_out$V11 == "Unspecified" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
84 Name = gsub("__.+","",rm_out$V10) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
85 # chanche Usnpsecified to new name |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
86 gff$Name[inc] = Name |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
87 } |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
88 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
89 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
90 ## join neighbors with the same annotation, disregard strand! |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
91 result <- unlist(reduce(split(gff, gff$Name))) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
92 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
93 result$Name <- names(result) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
94 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
95 result_clean = gff_cleanup(result) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
96 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
97 ## TODO |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
98 ## identify conflicting annotation, replace by LCA but keep origin list of classifications |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
99 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
100 gff_out = sortSeqlevels(result_clean) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
101 gff_out = sort(gff_out) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
102 gff_out$type = "repeat_region" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
103 gff_out$source = "RepeatMasker_parsed" |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
104 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
105 export(gff_out, format = "gff3", con=outfile) |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
106 |
d14182506989
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
petrn
parents:
diff
changeset
|
107 |