comparison kegg_pathways_visualization.R @ 12:9fe4a861601b draft

planemo upload commit 7e2bd28d27e13c402acd46500f64d5c117797aa7-dirty
author proteore
date Fri, 09 Nov 2018 05:11:46 -0500
parents
children da82872f5c80
comparison
equal deleted inserted replaced
11:6d5c0ff2b0bd 12:9fe4a861601b
1 #!/usr/bin/Rscript
2 #Rscript made for mapping genesID on KEGG pathway with Pathview package
3 #input : csv file containing ids (uniprot or geneID) to map, plus parameters
4 #output : KEGG pathway : jpeg or pdf file.
5
6 options(warn=-1) #TURN OFF WARNINGS !!!!!!
7
8 suppressMessages(library("pathview"))
9
10 read_file <- function(path,header){
11 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE)
12 if (inherits(file,"try-error")){
13 stop("File not found !")
14 }else{
15 return(file)
16 }
17 }
18
19 ##### fuction to clean and concatenate pathway name (allow more flexibility for user input)
20 concat_string <- function(x){
21 x <- gsub(" - .*","",x)
22 x <- gsub(" ","",x)
23 x <- gsub("-","",x)
24 x <- gsub("_","",x)
25 x <- gsub(",","",x)
26 x <- gsub("\\'","",x)
27 x <- gsub("\\(.*)","",x)
28 x <- gsub("\\/","",x)
29 x <- tolower(x)
30 return(x)
31 }
32
33 #return output suffix (pathway name) from id kegg (ex : hsa:00010)
34 get_suffix <- function(pathways_list,species,id){
35 suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2])
36 suffix = gsub(" ","_",suffix)
37 if (nchar(suffix) > 50){
38 suffix = substr(suffix,1,50)
39 }
40 return(suffix)
41 }
42
43 str2bool <- function(x){
44 if (any(is.element(c("t","true"),tolower(x)))){
45 return (TRUE)
46 }else if (any(is.element(c("f","false"),tolower(x)))){
47 return (FALSE)
48 }else{
49 return(NULL)
50 }
51 }
52
53 is.letter <- function(x) grepl("[[:alpha:]]", x)
54
55 #### hsa00010 -> 00010
56 remove_kegg_prefix <- function(x){
57 x = gsub(":","",x)
58 if (substr(x,1,4) == 'path'){
59 x=substr(x,5,nchar(x))
60 }
61 if (is.letter(substr(x,1,3))){
62 x <- substr(x,4,nchar(x))
63 }
64 return(x)
65 }
66
67 clean_bad_character <- function(string) {
68 string <- gsub("X","",string)
69 return(string)
70 }
71
72 get_list_from_cp <-function(list){
73 list = strsplit(list, "[ \t\n]+")[[1]]
74 list = list[list != ""] #remove empty entry
75 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2")
76 return(list)
77 }
78
79 #return a summary from the mapping with pathview in a vector
80 mapping_summary <- function(pv.out,species,id,id_type){
81
82 mapped <- pv.out$plot.data.gene$kegg.names[which(pv.out$plot.data.gene$all.mapped!='')]
83 nb_mapped <- length(mapped)
84 nb_kegg_id <- length(unique(pv.out$plot.data.gene$kegg.names))
85 ratio = round((nb_mapped/nb_kegg_id)*100, 2)
86 if (is.nan(ratio)) { ratio = ""}
87 pathway_id = paste(species,id,sep="")
88 pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2])
89
90 if (id_type=="geneid"){
91 row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_kegg_id,nb_mapped,ratio,paste(mapped,collapse=";"))
92 names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped",
93 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped")
94 }else if (id_type=="uniprotid"){
95 row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_kegg_id,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";"))
96 names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped",
97 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped")
98 }
99 return(row)
100 }
101
102 get_args <- function(){
103
104 ## Collect arguments
105 args <- commandArgs(TRUE)
106
107 ## Default setting when no arguments passed
108 if(length(args) < 1) {
109 args <- c("--help")
110 }
111
112 ## Help section
113 if("--help" %in% args) {
114 cat("Pathview R script
115 Arguments:
116 --help Print this test
117 --input path of the input file (must contains a colum of uniprot and/or geneID accession number)
118 --id_list list of ids to use, ',' separated
119 --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412
120 --id_type Type of accession number ('uniprotID' or 'geneID')
121 --id_column Column containing accesion number of interest (ex : 'c1')
122 --header Boolean, TRUE if header FALSE if not
123 --output Output filename
124 --fold_change_col Column(s) containing fold change values (comma separated)
125 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph
126 --species KEGG species (hsa, mmu, ...)
127 --pathways_input Tab with pathways in a column, output format of find_pathways
128 --pathway_col Column of pathways to use
129 --header2 Boolean, TRUE if header FALSE if not
130 --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...)
131
132 Example:
133 ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n")
134
135 q(save="no")
136 }
137
138 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
139 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
140 args <- as.list(as.character(argsDF$V2))
141 names(args) <- argsDF$V1
142
143 return(args)
144 }
145
146 args <- get_args()
147
148 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_pathways_visualization/args.Rda")
149 #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_pathways_visualization/args.Rda")
150
151 ###setting variables
152 if (!is.null(args$pathways_id)) {
153 ids <- get_list_from_cp(clean_bad_character(args$pathways_id))
154 ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE)
155 }else if (!is.null(args$pathways_input)){
156 header2 <- str2bool(args$header2)
157 pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col))
158 pathways_file = read_file(args$pathways_input,header2)
159 ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE)
160 }
161 pathways_list <- read_file(args$pathways_list,F)
162 if (!is.null(args$id_list)) {
163 id_list <- get_list_from_cp(args$id_list)
164 }
165 id_type <- tolower(args$id_type)
166 ncol <- as.numeric(gsub("c", "" ,args$id_column))
167 header <- str2bool(args$header)
168 native_kegg <- str2bool(args$native_kegg)
169 species=args$species
170 fold_change_data = str2bool(args$fold_change_data)
171
172 #org list used in mapped2geneID
173 org <- c('Hs','Mm','Rn')
174 names(org) <- c('hsa','mmu','rno')
175
176 #read input file or list
177 if (!is.null(args$input)){
178 tab <- read_file(args$input,header)
179 tab <- data.frame(tab[which(tab[ncol]!=""),])
180 } else {
181 tab <- data.frame(id_list,stringsAsFactors = F)
182 ncol=1
183 }
184
185 #fold change columns
186 #make sure its double and name expression value columns
187 if (fold_change_data){
188 fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),",")))
189 if (length(fold_change) > 3) { fold_change= fold_change[1:3] }
190 for (i in 1:length(fold_change)) {
191 fc_col = fold_change[i]
192 colnames(tab)[fc_col] <- paste("e",i,sep='')
193 tab[,fc_col] <- as.double(gsub(",",".",as.character(tab[,fc_col]) ))
194 }
195 }
196
197 ##### map uniprotID to entrez geneID
198 if (id_type == "uniprotid") {
199 uniprotID = tab[,ncol]
200 mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL)
201 geneID = mapped2geneID[,2]
202 tab = cbind(tab,geneID)
203 }else if (id_type == "geneid"){
204 colnames(tab)[ncol] <- "geneID"
205 }
206
207 geneID = as.character(tab$geneID[which(!is.na(tab$geneID))])
208 geneID = gsub(" ","",geneID)
209 geneID = unlist(strsplit(geneID,"[;]"))
210
211 ##### build matrix to map on KEGG pathway (kgml : KEGG xml)
212 if (fold_change_data) {
213 geneID_indices = which(!duplicated(geneID))
214 if (length(fold_change) == 3){
215 mat <- as.data.frame(cbind(tab$e1,tab$e2,tab$e3)[which(!is.na(tab$geneID)),])
216 mat = mat[geneID_indices,]
217 row.names(mat) <- geneID[geneID_indices]
218 } else if (length(fold_change) == 2){
219 mat <- as.data.frame(cbind(tab$e1,tab$e2)[which(!is.na(tab$geneID)),])
220 mat = mat[geneID_indices,]
221 row.names(mat) <- geneID[geneID_indices]
222 } else {
223 mat <- as.data.frame(cbind(tab$e1)[which(!is.na(tab$geneID)),])
224 mat = mat[geneID_indices,]
225 names(mat) <- geneID[geneID_indices]
226 }
227 } else {
228 mat <- geneID
229 }
230
231 #####mapping geneID (with or without expression values) on KEGG pathway
232 plot.col.key= TRUE
233 low_color = "green"
234 mid_color = "#F3F781" #yellow
235 high_color = "red"
236 if (is.null(tab$e1)) {
237 plot.col.key= FALSE #if there's no exrepession data, we don't show the color key
238 high_color = "#81BEF7" #blue
239 }
240
241 #create graph(s) and text output
242 for (id in ids) {
243 suffix= get_suffix(pathways_list,species,id)
244 pv.out <- suppressMessages(pathview(gene.data = mat,
245 gene.idtype = "entrez",
246 pathway.id = id,
247 species = species,
248 kegg.dir = ".",
249 out.suffix=suffix,
250 kegg.native = native_kegg,
251 low = list(gene = low_color, cpd = "blue"),
252 mid = list(gene = mid_color, cpd = "transparent"),
253 high = list(gene = high_color, cpd = "yellow"),
254 na.col="#D8D8D8", #gray
255 cpd.data=NULL,
256 plot.col.key = plot.col.key,
257 pdf.size=c(9,9)))
258
259 if (is.list(pv.out)){
260
261 #creating text file
262 if (!exists("DF")) {
263 DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type)),stringsAsFactors = F,check.names = F)
264 } else {
265 #print (mapping_summary(pv.out,species,id))
266 DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type)),stringsAsFactors = F,check.names = F))
267 }
268 }
269
270 }
271
272 DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA
273
274 #text file output
275 write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE)