comparison kegg_maps_visualization.R @ 0:789acaab8255 draft

planemo upload commit 78ad61e52c2bf8c5ffada89a8eed429a332eb40b-dirty
author proteore
date Thu, 06 Dec 2018 08:27:31 -0500
parents
children 8a6863adcd09
comparison
equal deleted inserted replaced
-1:000000000000 0:789acaab8255
1 #!/usr/bin/Rscript
2 #Rscript made for mapping genesID on KEGG pathway with Pathview package
3 #input : csv file containing ids (uniprot or geneID) to map, plus parameters
4 #output : KEGG pathway : jpeg or pdf file.
5
6 options(warn=-1) #TURN OFF WARNINGS !!!!!!
7 suppressMessages(library("pathview"))
8 suppressMessages(library(KEGGREST))
9
10 read_file <- function(path,header){
11 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F, comment.char = "#"),silent=TRUE)
12 if (inherits(file,"try-error")){
13 stop("File not found !")
14 }else{
15 return(file)
16 }
17 }
18
19 ##### fuction to clean and concatenate pathway name (allow more flexibility for user input)
20 concat_string <- function(x){
21 x <- gsub(" - .*","",x)
22 x <- gsub(" ","",x)
23 x <- gsub("-","",x)
24 x <- gsub("_","",x)
25 x <- gsub(",","",x)
26 x <- gsub("\\'","",x)
27 x <- gsub("\\(.*)","",x)
28 x <- gsub("\\/","",x)
29 x <- tolower(x)
30 return(x)
31 }
32
33 #return output suffix (pathway name) from id kegg (ex : hsa:00010)
34 get_suffix <- function(pathways_list,species,id){
35 suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2])
36 suffix = gsub(" ","_",suffix)
37 if (nchar(suffix) > 50){
38 suffix = substr(suffix,1,50)
39 }
40 return(suffix)
41 }
42
43 str2bool <- function(x){
44 if (any(is.element(c("t","true"),tolower(x)))){
45 return (TRUE)
46 }else if (any(is.element(c("f","false"),tolower(x)))){
47 return (FALSE)
48 }else{
49 return(NULL)
50 }
51 }
52
53 is.letter <- function(x) grepl("[[:alpha:]]", x)
54
55 #### hsa00010 -> 00010
56 remove_kegg_prefix <- function(x){
57 x = gsub(":","",x)
58 if (substr(x,1,4) == 'path'){
59 x=substr(x,5,nchar(x))
60 }
61 if (is.letter(substr(x,1,3))){
62 x <- substr(x,4,nchar(x))
63 }
64 return(x)
65 }
66
67 kegg_to_geneID <- function(vector){
68 vector <- sapply(vector, function(x) unlist(strsplit(x,":"))[2],USE.NAMES = F)
69 return (vector)
70 }
71
72 clean_bad_character <- function(string) {
73 string <- gsub("X","",string)
74 return(string)
75 }
76
77 get_list_from_cp <-function(list){
78 list = strsplit(list, "[ \t\n]+")[[1]]
79 list = list[list != ""] #remove empty entry
80 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2")
81 return(list)
82 }
83
84 get_ref_pathways <- function(species){
85 ##all available pathways for the species
86 pathways <-keggLink("pathway", species)
87 tot_path<-unique(pathways)
88
89 ##formating the dat into a list object
90 ##key= pathway ID, value = genes of the pathway in the kegg format
91 pathways_list <- sapply(tot_path, function(pathway) names(which(pathways==pathway)))
92 return (pathways_list)
93 }
94
95 mapping_summary <- function(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID){
96 ref_pathways = get_ref_pathways(species)
97 names(ref_pathways) <- sapply(names(ref_pathways), function(x) gsub("path:[a-z]{3}","",x),USE.NAMES = F)
98
99 #genes present in pathway
100 genes = ref_pathways[id][[1]]
101 nb_genes = length(genes)
102
103 #genes mapped on pathway genes
104 mapped <- unlist(sapply(pv.out$plot.data.gene$all.mapped, function(x) strsplit(x,",")),use.names = F)
105 mapped = unique(mapped[mapped!=""])
106 nb_mapped <- length(mapped)
107
108 #compue ratio of mapping
109 ratio = round((nb_mapped/nb_genes)*100, 2)
110 if (is.nan(ratio)) { ratio = ""}
111 pathway_id = paste(species,id,sep="")
112 pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2])
113
114 if (id_type=="geneid" || id_type=="keggid") {
115 row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";"))
116 names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped",
117 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped")
118 } else if (id_type=="uniprotid") {
119 row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";"))
120 names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped",
121 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped")
122 }
123 return(row)
124 }
125
126 #take data frame, return data frame
127 split_ids_per_line <- function(line,ncol){
128
129 #print (line)
130 header = colnames(line)
131 line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol])
132
133 if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) {
134 if (length(line)==1 ) {
135 lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F)
136 } else {
137 if (ncol==1) { #first column
138 lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)]))
139 } else if (ncol==length(line)) { #last column
140 lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";"))))
141 } else {
142 lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)]))
143 }
144 }
145 colnames(lines)=header
146 return(lines)
147 } else {
148 return(line)
149 }
150 }
151
152 #create new lines if there's more than one id per cell in the columns in order to have only one id per line
153 one_id_one_line <-function(tab,ncol){
154
155 if (ncol(tab)>1){
156
157 tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x))
158 header=colnames(tab)
159 res=as.data.frame(matrix(ncol=ncol(tab),nrow=0))
160 for (i in 1:nrow(tab) ) {
161 lines = split_ids_per_line(tab[i,],ncol)
162 res = rbind(res,lines)
163 }
164 }else {
165 res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F)
166 res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F)
167 colnames(res)=colnames(tab)
168 }
169 return(res)
170 }
171
172 get_args <- function(){
173
174 ## Collect arguments
175 args <- commandArgs(TRUE)
176
177 ## Default setting when no arguments passed
178 if(length(args) < 1) {
179 args <- c("--help")
180 }
181
182 ## Help section
183 if("--help" %in% args) {
184 cat("Pathview R script
185 Arguments:
186 --help Print this test
187 --input path of the input file (must contains a colum of uniprot and/or geneID accession number)
188 --id_list list of ids to use, ',' separated
189 --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412
190 --id_type Type of accession number ('uniprotID' or 'geneID')
191 --id_column Column containing accesion number of interest (ex : 'c1')
192 --header Boolean, TRUE if header FALSE if not
193 --output Output filename
194 --fold_change_col Column(s) containing fold change values (comma separated)
195 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph
196 --species KEGG species (hsa, mmu, ...)
197 --pathways_input Tab with pathways in a column, output format of find_pathways
198 --pathway_col Column of pathways to use
199 --header2 Boolean, TRUE if header FALSE if not
200 --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...)
201
202 Example:
203 ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n")
204
205 q(save="no")
206 }
207
208 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
209 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
210 args <- as.list(as.character(argsDF$V2))
211 names(args) <- argsDF$V1
212
213 return(args)
214 }
215
216 main <- function(){
217
218 args <- get_args()
219
220 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda")
221 #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda")
222
223 ###setting variables
224 if (!is.null(args$pathways_id)) {
225 ids <- get_list_from_cp(clean_bad_character(args$pathways_id))
226 ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE)
227 }else if (!is.null(args$pathways_input)){
228 header2 <- str2bool(args$header2)
229 pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col))
230 pathways_file = read_file(args$pathways_input,header2)
231 ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE)
232 }
233 pathways_list <- read_file(args$pathways_list,F)
234 if (!is.null(args$id_list)) {
235 id_list <- get_list_from_cp(args$id_list)
236 }
237 id_type <- tolower(args$id_type)
238 ncol <- as.numeric(gsub("c", "" ,args$id_column))
239 header <- str2bool(args$header)
240 native_kegg <- str2bool(args$native_kegg)
241 species=args$species
242 fold_change_data = str2bool(args$fold_change_data)
243
244 #org list used in mapped2geneID
245 org <- c('Hs','Mm','Rn')
246 names(org) <- c('hsa','mmu','rno')
247
248 #read input file or list
249 if (!is.null(args$input)){
250 tab <- read_file(args$input,header)
251 tab <- data.frame(tab[which(tab[ncol]!=""),],stringsAsFactors = F)
252 tab = one_id_one_line(tab,ncol)
253 } else {
254 id_list = gsub("[[:blank:]]|\u00A0|NA","",id_list)
255 id_list = unique(id_list[id_list!=""])
256 tab <- data.frame(id_list,stringsAsFactors = F)
257 ncol=1
258 }
259
260
261 ##### map uniprotID to entrez geneID and kegg to geneID
262 uniprotID=""
263 mapped2geneID=""
264 if (id_type == "uniprotid") {
265 uniprotID=tab[,ncol]
266 mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL)
267 geneID = mapped2geneID[,2]
268 tab = cbind(tab,geneID)
269 ncol=ncol(tab)
270 }else if (id_type == "keggid"){
271 keggID = tab[,ncol]
272 geneID = kegg_to_geneID(keggID)
273 tab = cbind(tab,geneID)
274 ncol=ncol(tab)
275 }else if (id_type == "geneid"){
276 colnames(tab)[ncol] <- "geneID"
277 }
278
279 ##### build matrix to map on KEGG pathway (kgml : KEGG xml)
280 geneID_indices = which(!is.na(tab$geneID))
281 if (fold_change_data) {
282 fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),",")))
283 if (length(fold_change) > 3) { fold_change= fold_change[1:3] }
284 if (length(fold_change)==1){
285 tab[,fold_change] <- as.double(gsub(",",".",as.character(tab[,fold_change]) ))
286 } else {
287 tab[,fold_change] <- apply(tab[,fold_change],2,function(x) as.double(gsub(",",".",as.character(x))))
288 }
289 mat = tab[geneID_indices,c(ncol,fold_change)]
290 mat = mat[(!duplicated(mat$geneID)),]
291 geneID=mat$geneID
292 mat = as.data.frame(mat[,-1])
293 row.names(mat)=geneID
294 } else {
295 mat = unique(as.character(tab$geneID[!is.na(tab$geneID[tab$geneID!=""])]))
296 geneID=mat
297 }
298
299 #####mapping geneID (with or without expression values) on KEGG pathway
300 plot.col.key= TRUE
301 low_color = "green"
302 mid_color = "#F3F781" #yellow
303 high_color = "red"
304 if (!fold_change_data) {
305 plot.col.key= FALSE #if there's no exrepession data, we don't show the color key
306 high_color = "#81BEF7" #blue
307 }
308
309 #create graph(s) and text output
310 for (id in ids) {
311 suffix= get_suffix(pathways_list,species,id)
312 pv.out <- suppressMessages(pathview(gene.data = mat,
313 gene.idtype = "entrez",
314 pathway.id = id,
315 species = species,
316 kegg.dir = ".",
317 out.suffix=suffix,
318 kegg.native = native_kegg,
319 low = list(gene = low_color, cpd = "blue"),
320 mid = list(gene = mid_color, cpd = "transparent"),
321 high = list(gene = high_color, cpd = "yellow"),
322 na.col="#D8D8D8", #gray
323 cpd.data=NULL,
324 plot.col.key = plot.col.key,
325 pdf.size=c(9,9)))
326
327 if (is.list(pv.out)){
328
329 #creating text file
330 if (!exists("DF")) {
331 DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F)
332 } else {
333 #print (mapping_summary(pv.out,species,id))
334 DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F))
335 }
336 }
337 }
338
339 DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA
340
341 #text file output
342 write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE)
343 }
344
345 main()