Mercurial > repos > petrn > repeat_annotation_pipeline
comparison clean_rm_output.R @ 0:d14182506989 draft default tip
"planemo upload commit d7966a292ed4209f4058e77ab8c0e49a67847b16-dirty"
| author | petrn |
|---|---|
| date | Tue, 15 Feb 2022 16:44:31 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d14182506989 |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 suppressPackageStartupMessages(library(rtracklayer)) | |
| 3 suppressPackageStartupMessages(library(parallel)) | |
| 4 | |
| 5 | |
| 6 gff_cleanup = function(gff){ | |
| 7 ## remove overlapin annotation track - assign new annot | |
| 8 gff_disjoin = disjoin(gff, with.revmap=TRUE) | |
| 9 ## append annotation: | |
| 10 gff_names = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)gff$Name[x], mc.cores = 8) | |
| 11 gff_strands = mclapply(as.list(gff_disjoin$revmap), FUN = function(x)strand(gff[x]), mc.cores = 8) | |
| 12 new_annot = sapply(sapply(gff_names, unique), paste, collapse="|") | |
| 13 new_annot_uniq = unique(new_annot) | |
| 14 lca_annot = sapply(strsplit(new_annot_uniq, "|", fixed = TRUE), resolve_name) | |
| 15 names(lca_annot) = new_annot_uniq | |
| 16 new_annot_lca = lca_annot[new_annot] | |
| 17 #new_annot_lca = sapply(sapply(gff_names, unique), resolve_name) | |
| 18 strand_attribute = sapply(sapply(gff_strands, unique), paste, collapse="|") | |
| 19 gff_disjoin$strands=strand_attribute | |
| 20 gff_disjoin$source="RM" | |
| 21 gff_disjoin$type="repeat" | |
| 22 gff_disjoin$score=NA | |
| 23 gff_disjoin$phase=NA | |
| 24 gff_disjoin$Name=new_annot_lca | |
| 25 gff_disjoin$Original_names=new_annot | |
| 26 gff_disjoin$revmap=NULL | |
| 27 return(gff_disjoin) | |
| 28 } | |
| 29 | |
| 30 resolve_name=function(x){ | |
| 31 if (length(x)==1){ | |
| 32 # no conflict | |
| 33 return(x) | |
| 34 } else{ | |
| 35 y = sapply(x, strsplit, split="/", fixed = TRUE) | |
| 36 ny = table(unlist(sapply(y, function(x)paste(seq_along(x),x)))) | |
| 37 if (max(ny)<length(x)){ | |
| 38 return("Unknown") | |
| 39 }else{ | |
| 40 k = which(ny==length(x)) | |
| 41 r = max(as.numeric((gsub(" .+","",names(k))))) | |
| 42 out = paste(y[[1]][1:r], collapse="/") | |
| 43 return(out) | |
| 44 } | |
| 45 } | |
| 46 } | |
| 47 | |
| 48 convert_names <- function(n, old_sep = "|" , new_sep = "\""){ | |
| 49 # remove all characters which are new_sep with - | |
| 50 n_new = gsub(old_sep, new_sep, | |
| 51 gsub(new_sep,"-", n, fixed = TRUE), | |
| 52 fixed = TRUE) | |
| 53 return(n_new) | |
| 54 } | |
| 55 | |
| 56 | |
| 57 infile = commandArgs(T)[1] | |
| 58 outfile = commandArgs(T)[2] | |
| 59 | |
| 60 | |
| 61 ## infile = "./test_data/raw_rm.out" | |
| 62 | |
| 63 rm_out = read.table(infile, as.is=TRUE, sep="", skip = 2, fill=TRUE, header=FALSE, col.names=paste0("V",1:16)) | |
| 64 | |
| 65 gff = GRanges(seqnames = rm_out$V5, ranges = IRanges(start = rm_out$V6, end=rm_out$V7)) | |
| 66 | |
| 67 # repeat class after # symbol - syntax 1 | |
| 68 # detect separator | |
| 69 # if "|" is present replace "|" -> "/" and "/" -> "-" | |
| 70 if (any(grep("|", rm_out$V11))){ | |
| 71 gff$Name <- convert_names(rm_out$V11, old_sep = "|", new_sep = "/") | |
| 72 message('replacing classification separator character "|" with "/"') | |
| 73 print(gff) | |
| 74 }else{ | |
| 75 gff$Name <- rm_out$V11 | |
| 76 } | |
| 77 | |
| 78 ## is repeat type is specifies by double underscore: | |
| 79 ## then rm_out$V11 is unspecified | |
| 80 if (any(rm_out$V11 == "Unspecified")){ | |
| 81 ## set Name from prefix | |
| 82 ## TODO | |
| 83 inc = rm_out$V11 == "Unspecified" | |
| 84 Name = gsub("__.+","",rm_out$V10) | |
| 85 # chanche Usnpsecified to new name | |
| 86 gff$Name[inc] = Name | |
| 87 } | |
| 88 | |
| 89 | |
| 90 ## join neighbors with the same annotation, disregard strand! | |
| 91 result <- unlist(reduce(split(gff, gff$Name))) | |
| 92 | |
| 93 result$Name <- names(result) | |
| 94 | |
| 95 result_clean = gff_cleanup(result) | |
| 96 | |
| 97 ## TODO | |
| 98 ## identify conflicting annotation, replace by LCA but keep origin list of classifications | |
| 99 | |
| 100 gff_out = sortSeqlevels(result_clean) | |
| 101 gff_out = sort(gff_out) | |
| 102 gff_out$type = "repeat_region" | |
| 103 gff_out$source = "RepeatMasker_parsed" | |
| 104 gff_out$ID=paste0(gff_out$Name, "_", seq_along(gff_out$Name)) | |
| 105 export(gff_out, format = "gff3", con=outfile) | |
| 106 | |
| 107 |
