annotate kegg_identification.R @ 6:042254fb1f8d draft default tip

"planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
author proteore
date Mon, 17 May 2021 12:19:30 +0000
parents dc39f12f96d1
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
1 options(warn = -1) #TURN OFF WARNINGS !!!!!!
3
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
2
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
3 suppressMessages(library(KEGGREST))
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
4
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Parse command-line arguments of the form `--name=value` into a named list.
#
# args: character vector of raw arguments. Defaults to the arguments passed
#       to the script (`commandArgs(TRUE)`), so existing `get_args()` calls
#       behave as before.
#
# Returns a list of character values named by argument (leading "--" removed).
# An argument given without "=" gets the value NA.
# Prints the help text and quits the R session when "--help" is present or
# when no argument is supplied.
get_args <- function(args = commandArgs(TRUE)) {

  ## Default setting when no arguments passed
  if (length(args) < 1) {
    args <- c("--help")
  }

  ## Help section
  if ("--help" %in% args) {
    cat("Pathview R script
    Arguments:
      --help                  Print this text
      --input                 tab file
      --id_list               id list, whitespace separated
      --id_type               type of input ids (kegg-id, uniprot, ncbi-geneid)
      --id_column             number of the column containing ids of interest
      --nb_pathways           number of pathways to return
      --header                boolean
      --output                output path
      --species               species used to get specific pathways(hsa,mmu,rno)

    Example:
      Rscript kegg_identification.R --id_list='P31946 P62258'
      --id_type='uniprot' --species='hsa' --nb_pathways=10
      --output='kegg_pathways.tsv' \n\n")

    q(save = "no")
  }

  ## Split every argument on its FIRST "=" only, so that a value may itself
  ## contain "=" without being truncated
  stripped <- sub("^--", "", args)
  eq_pos <- as.integer(regexpr("=", stripped, fixed = TRUE))
  has_value <- eq_pos > 0
  keys <- ifelse(has_value, substr(stripped, 1, eq_pos - 1), stripped)
  values <- ifelse(has_value,
                   substr(stripped, eq_pos + 1, nchar(stripped)),
                   NA_character_)

  parsed <- as.list(values)
  names(parsed) <- keys
  parsed
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
43
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Translate a textual flag into a logical value.
#
# x: value to interpret; compared case-insensitively.
#
# Returns TRUE for "t"/"true", FALSE for "f"/"false", and NULL for anything
# else (including NULL input).
str2bool <- function(x) {
  lowered <- tolower(x)
  if (any(c("t", "true") %in% lowered)) {
    return(TRUE)
  }
  if (any(c("f", "false") %in% lowered)) {
    return(FALSE)
  }
  NULL
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
53
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Read a tab-separated file into a data frame.
#
# path:   location of the file to read.
# header: logical, TRUE when the first line holds the column names.
#
# Strings are kept as character and column names are left unmodified.
# Stops with "File not found !" when `path` does not exist; any other read
# failure is reported with the underlying error message instead of being
# mislabelled as a missing file.
read_file <- function(path, header) {
  tryCatch(
    read.csv(path, header = header, sep = "\t", stringsAsFactors = FALSE,
             quote = "\"", check.names = FALSE),
    error = function(e) {
      if (is.character(path) && length(path) == 1 && !file.exists(path)) {
        stop("File not found !", call. = FALSE)
      }
      stop("Unable to read '", path, "': ", conditionMessage(e),
           call. = FALSE)
    }
  )
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
63
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Build the pathway -> genes lookup for one species.
#
# species: organism code passed on to keggLink() (e.g. "hsa").
#
# Returns a named list: one element per pathway id as returned by
# keggLink() (the vector values), holding the character vector of the names
# of the entries linked to that pathway. Pathways are listed in order of
# first appearance.
#
# split() is used rather than sapply() so the result is always a list, even
# when every pathway has the same number of genes (where sapply() would
# simplify to a vector or a matrix), and so the link table is scanned once.
get_pathways_list <- function(species) {
  ## all available pathway links for the species
  pathways <- keggLink("pathway", species)
  if (length(pathways) == 0) {
    return(list())
  }

  ## key = pathway ID, value = genes of the pathway in the kegg format
  pathway_ids <- as.character(pathways)
  split(names(pathways), factor(pathway_ids, levels = unique(pathway_ids)))
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
75
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Turn a pasted, whitespace-separated string of ids into a clean id vector.
#
# list: a single character string holding ids separated by spaces, tabs or
#       newlines.
#
# Returns the unique ids with blanks removed, isoform suffixes stripped
# (everything from the first "-", e.g. "P62258-2" -> "P62258") and empty or
# literal "NA" entries dropped.
get_list_from_cp <- function(list) {
  ids <- strsplit(list, "[ \t\n]+")[[1]]
  ids <- gsub("[[:blank:]]|\u00A0", "", ids)
  # Remove isoform accession number (e.g. "-2")
  ids <- gsub("-.+", "", ids)
  # Drop empty / missing entries. Only tokens that ARE "NA" are removed:
  # deleting the substring "NA" would corrupt ids that merely contain it.
  ids <- ids[!is.na(ids) & ids != "" & ids != "NA"]
  unique(ids)
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
84
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Prefix gene ids with a species code to form "<species>:<id>" identifiers.
#
# vector:  gene ids (any atomic vector; converted to character).
# species: organism code used as prefix (e.g. "hsa").
#
# Returns an unnamed character vector of the same length as `vector`.
# Empty input gives character(0): paste() alone would return "<species>:".
geneid_to_kegg <- function(vector, species) {
  if (length(vector) == 0) {
    return(character(0))
  }
  paste(species, as.character(vector), sep = ":")
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
90
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Convert a vector of ids to unique KEGG gene ids.
#
# id_list: ids to convert.
# id_type: "ncbi-geneid" (ids are prefixed with the species code),
#          "uniprot" (ids are converted through keggConv()) or
#          "kegg-id" (ids are only de-duplicated). Any other value returns
#          `id_list` unchanged.
# species: organism code used for "ncbi-geneid". Defaults to the global
#          `args$species`, which is what the function always relied on; the
#          default is only evaluated when that branch is taken.
#
# Returns the unique converted ids.
to_keggid <- function(id_list, id_type, species = args$species) {
  if (id_type == "ncbi-geneid") {
    id_list <- unique(geneid_to_kegg(id_list, species))
  } else if (id_type == "uniprot") {
    if (length(id_list) == 0) {
      return(character(0))
    }
    queries <- unique(paste0(id_type, ":", id_list))
    # keggConv() is queried in batches of at most 250 ids. lapply() keeps one
    # result per batch; sapply() would build a matrix when all batches return
    # the same number of ids, and unique() would then de-duplicate rows.
    batches <- split(queries, ceiling(seq_along(queries) / 250))
    converted <- lapply(batches, function(batch) keggConv("genes", batch))
    id_list <- unique(unlist(converted, use.names = FALSE))
  } else if (id_type == "kegg-id") {
    id_list <- unique(id_list)
  }
  id_list
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
109
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
# Expand one table row so that each ";"-separated id gets its own row.
#
# line: a single-row data frame (the caller passes `tab[i, ]`).
# ncol: index of the column holding the ids.
#
# Blanks and non-breaking spaces are removed from the id cell first. When the
# cell holds at most one id the row is returned as is (with the cleaned
# cell); otherwise a data frame with one row per id is returned, the other
# columns being repeated and the original column names restored.
split_ids_per_line <- function(line, ncol) {
  col_names <- colnames(line)
  line[ncol] <- gsub("[[:blank:]]|\u00A0", "", line[ncol])
  ids <- unlist(strsplit(as.character(line[ncol]), ";"))

  # Nothing to expand: zero or one id in the cell
  if (length(ids) <= 1) {
    return(line)
  }

  n_fields <- length(line)
  # cbind() recycles the one-row pieces against the id vector; the warnings
  # it raises about discarded row names are expected and silenced.
  if (n_fields == 1) {
    expanded <- as.data.frame(ids, stringsAsFactors = FALSE)
  } else if (ncol == 1) { # ids in the first column
    expanded <- suppressWarnings(cbind(ids, line[2:n_fields]))
  } else if (ncol == n_fields) { # ids in the last column
    expanded <- suppressWarnings(cbind(line[seq_len(ncol - 1)], ids))
  } else { # ids in a middle column
    expanded <- suppressWarnings(
      cbind(line[seq_len(ncol - 1)], ids, line[(ncol + 1):n_fields])
    )
  }

  colnames(expanded) <- col_names
  expanded
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
140
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Create new rows when a cell of the id column holds several ";"-separated
# ids, so that the result has exactly one id per row.
#
# tab:  data frame read from the input file.
# ncol: index of the column holding the ids.
#
# Returns a data frame with the same column names as `tab`.
# - Several columns: every row is expanded with split_ids_per_line() and the
#   pieces are bound together in one rbind() call (not row by row).
# - Single column: blanks are stripped, the cells are split on ";" and empty
#   or missing ids are dropped. The column is converted to character first,
#   so numeric id columns (e.g. gene ids) are accepted.
one_id_one_line <- function(tab, ncol) {
  # `ncol` the argument shadows the function name, hence base::ncol()
  if (base::ncol(tab) > 1) {
    if (nrow(tab) == 0) {
      return(tab)
    }
    tab[, ncol] <- gsub("[[:blank:]]", "", tab[, ncol])
    pieces <- lapply(seq_len(nrow(tab)), function(i) {
      split_ids_per_line(tab[i, , drop = FALSE], ncol)
    })
    res <- do.call(rbind, pieces)
  } else {
    ids <- gsub("[[:blank:]]|\u00A0", "", as.character(tab[, 1]))
    ids <- as.character(unlist(strsplit(ids, ";"), use.names = FALSE))
    # Keep the ids that are neither missing nor empty
    ids <- ids[!is.na(ids) & ids != ""]
    res <- data.frame(ids, stringsAsFactors = FALSE)
    colnames(res) <- colnames(tab)
  }
  res
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
161
6
042254fb1f8d "planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
proteore
parents: 4
diff changeset
# Map a set of KEGG gene ids onto pathways and summarise the coverage.
#
# kegg_id_list: KEGG gene ids to look for (duplicates are ignored).
# id_type:      not used by the function body; kept so existing calls work.
# ref_ids:      named list, pathway id -> character vector of the pathway's
#               gene ids (as built by get_pathways_list()).
#
# Returns a data frame with one row per pathway containing at least one of
# the ids, sorted by decreasing ratio, with the columns: pathway id (without
# the "path:" prefix), description (NAME field returned by keggGet()),
# percentage of the pathway's genes that were mapped (2 decimals), number of
# mapped genes and total number of genes in the pathway. The data frame has
# zero rows when nothing maps.
kegg_mapping <- function(kegg_id_list, id_type, ref_ids) {

  # mapping: one logical vector per pathway, TRUE where the gene was queried
  query_ids <- unique(kegg_id_list)
  map <- lapply(ref_ids, is.element, query_ids)
  names(map) <- gsub("path:", "", names(map)) # remove the prefix "path:"

  in_path <- vapply(map, function(hits) sum(hits), integer(1))
  tot_path <- vapply(map, length, integer(1))

  mapped <- in_path != 0
  path_names <- names(in_path)[mapped]
  nb_mapped <- as.numeric(in_path[mapped])
  nb_total <- as.numeric(tot_path[mapped])
  ratio <- round(nb_mapped / nb_total * 100, 2)

  ## useful but LONG: one keggGet() request per mapped pathway.
  ## vapply() guarantees one string per pathway: a missing NAME becomes NA
  ## and a multi-valued NAME is collapsed.
  name <- vapply(path_names, function(id) {
    label <- keggGet(id)[[1]]$NAME
    if (length(label) == 0) {
      NA_character_
    } else {
      paste(label, collapse = "; ")
    }
  }, character(1), USE.NAMES = FALSE)

  res <- data.frame(I(path_names), I(name), ratio, nb_mapped, nb_total)
  res <- res[order(as.numeric(res[, 3]), decreasing = TRUE), ]
  colnames(res) <- c("pathway_ID", "Description",
                     "Ratio IDs mapped / total IDs (%)",
                     "nb KEGG genes IDs mapped in the pathway",
                     "nb total of KEGG genes IDs present in the pathway")

  res
}
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
194
e7b3609160c5 planemo upload commit 6854718828a7478905b41fb92d6f96ab41896e84-dirty
proteore
parents:
diff changeset
# get args from command line
args <- get_args()

### setting variables
header <- str2bool(args$header)
id_list <- NULL
if (!is.null(args$id_list)) { # get ids from copy/paste input
  id_list <- get_list_from_cp(args$id_list)
}
if (!is.null(args$input)) { # get ids from input file
  if (is.null(header)) {
    stop("--header must be TRUE or FALSE when --input is used", call. = FALSE)
  }
  csv <- read_file(args$input, header)
  # the id column is given as "c<number>" (or a bare number)
  ncol <- as.numeric(gsub("c", "", args$id_column))
  csv <- one_id_one_line(csv, ncol)
  id_list <- as.vector(csv[, ncol])
  # keep the ids that are neither missing nor empty
  id_list <- unique(id_list[!is.na(id_list) & id_list != ""])
}
if (is.null(id_list)) {
  stop("No ids provided: use --id_list or --input", call. = FALSE)
}

# convert to keggID if needed
id_list <- to_keggid(id_list, args$id_type)

# get pathways of species with associated KEGG ID genes
pathways_list <- get_pathways_list(args$species)

# mapping on pathways
res <- kegg_mapping(id_list, args$id_type, pathways_list)

# keep the best-ranked pathways only; seq_len() also copes with a limit of 0
# (1:0 would select the first row). Without a usable --nb_pathways all rows
# are written.
nb_pathways <- as.numeric(args$nb_pathways)
if (length(nb_pathways) == 1 && !is.na(nb_pathways) &&
      nrow(res) > nb_pathways) {
  res <- res[seq_len(nb_pathways), ]
}

write.table(res, file = args$output, quote = FALSE, sep = "\t",
            row.names = FALSE, col.names = TRUE)