Mercurial > repos > proteore > proteore_pathview_mapping
comparison kegg_pathways_visualization.R @ 12:9fe4a861601b draft
planemo upload commit 7e2bd28d27e13c402acd46500f64d5c117797aa7-dirty
| author | proteore |
|---|---|
| date | Fri, 09 Nov 2018 05:11:46 -0500 |
| parents | |
| children | da82872f5c80 |
comparison
equal
deleted
inserted
replaced
| 11:6d5c0ff2b0bd | 12:9fe4a861601b |
|---|---|
| 1 #!/usr/bin/Rscript | |
| 2 #Rscript made for mapping genesID on KEGG pathway with Pathview package | |
| 3 #input : csv file containing ids (uniprot or geneID) to map, plus parameters | |
| 4 #output : KEGG pathway : jpeg or pdf file. | |
| 5 | |
| 6 options(warn=-1) #TURN OFF WARNINGS !!!!!! | |
| 7 | |
| 8 suppressMessages(library("pathview")) | |
| 9 | |
| 10 read_file <- function(path,header){ | |
| 11 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) | |
| 12 if (inherits(file,"try-error")){ | |
| 13 stop("File not found !") | |
| 14 }else{ | |
| 15 return(file) | |
| 16 } | |
| 17 } | |
| 18 | |
| 19 ##### fuction to clean and concatenate pathway name (allow more flexibility for user input) | |
| 20 concat_string <- function(x){ | |
| 21 x <- gsub(" - .*","",x) | |
| 22 x <- gsub(" ","",x) | |
| 23 x <- gsub("-","",x) | |
| 24 x <- gsub("_","",x) | |
| 25 x <- gsub(",","",x) | |
| 26 x <- gsub("\\'","",x) | |
| 27 x <- gsub("\\(.*)","",x) | |
| 28 x <- gsub("\\/","",x) | |
| 29 x <- tolower(x) | |
| 30 return(x) | |
| 31 } | |
| 32 | |
| 33 #return output suffix (pathway name) from id kegg (ex : hsa:00010) | |
| 34 get_suffix <- function(pathways_list,species,id){ | |
| 35 suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2]) | |
| 36 suffix = gsub(" ","_",suffix) | |
| 37 if (nchar(suffix) > 50){ | |
| 38 suffix = substr(suffix,1,50) | |
| 39 } | |
| 40 return(suffix) | |
| 41 } | |
| 42 | |
| 43 str2bool <- function(x){ | |
| 44 if (any(is.element(c("t","true"),tolower(x)))){ | |
| 45 return (TRUE) | |
| 46 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
| 47 return (FALSE) | |
| 48 }else{ | |
| 49 return(NULL) | |
| 50 } | |
| 51 } | |
| 52 | |
| 53 is.letter <- function(x) grepl("[[:alpha:]]", x) | |
| 54 | |
| 55 #### hsa00010 -> 00010 | |
| 56 remove_kegg_prefix <- function(x){ | |
| 57 x = gsub(":","",x) | |
| 58 if (substr(x,1,4) == 'path'){ | |
| 59 x=substr(x,5,nchar(x)) | |
| 60 } | |
| 61 if (is.letter(substr(x,1,3))){ | |
| 62 x <- substr(x,4,nchar(x)) | |
| 63 } | |
| 64 return(x) | |
| 65 } | |
| 66 | |
| 67 clean_bad_character <- function(string) { | |
| 68 string <- gsub("X","",string) | |
| 69 return(string) | |
| 70 } | |
| 71 | |
| 72 get_list_from_cp <-function(list){ | |
| 73 list = strsplit(list, "[ \t\n]+")[[1]] | |
| 74 list = list[list != ""] #remove empty entry | |
| 75 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") | |
| 76 return(list) | |
| 77 } | |
| 78 | |
| 79 #return a summary from the mapping with pathview in a vector | |
| 80 mapping_summary <- function(pv.out,species,id,id_type){ | |
| 81 | |
| 82 mapped <- pv.out$plot.data.gene$kegg.names[which(pv.out$plot.data.gene$all.mapped!='')] | |
| 83 nb_mapped <- length(mapped) | |
| 84 nb_kegg_id <- length(unique(pv.out$plot.data.gene$kegg.names)) | |
| 85 ratio = round((nb_mapped/nb_kegg_id)*100, 2) | |
| 86 if (is.nan(ratio)) { ratio = ""} | |
| 87 pathway_id = paste(species,id,sep="") | |
| 88 pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2]) | |
| 89 | |
| 90 if (id_type=="geneid"){ | |
| 91 row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_kegg_id,nb_mapped,ratio,paste(mapped,collapse=";")) | |
| 92 names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | |
| 93 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped") | |
| 94 }else if (id_type=="uniprotid"){ | |
| 95 row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_kegg_id,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";")) | |
| 96 names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | |
| 97 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped") | |
| 98 } | |
| 99 return(row) | |
| 100 } | |
| 101 | |
| 102 get_args <- function(){ | |
| 103 | |
| 104 ## Collect arguments | |
| 105 args <- commandArgs(TRUE) | |
| 106 | |
| 107 ## Default setting when no arguments passed | |
| 108 if(length(args) < 1) { | |
| 109 args <- c("--help") | |
| 110 } | |
| 111 | |
| 112 ## Help section | |
| 113 if("--help" %in% args) { | |
| 114 cat("Pathview R script | |
| 115 Arguments: | |
| 116 --help Print this test | |
| 117 --input path of the input file (must contains a colum of uniprot and/or geneID accession number) | |
| 118 --id_list list of ids to use, ',' separated | |
| 119 --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 | |
| 120 --id_type Type of accession number ('uniprotID' or 'geneID') | |
| 121 --id_column Column containing accesion number of interest (ex : 'c1') | |
| 122 --header Boolean, TRUE if header FALSE if not | |
| 123 --output Output filename | |
| 124 --fold_change_col Column(s) containing fold change values (comma separated) | |
| 125 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph | |
| 126 --species KEGG species (hsa, mmu, ...) | |
| 127 --pathways_input Tab with pathways in a column, output format of find_pathways | |
| 128 --pathway_col Column of pathways to use | |
| 129 --header2 Boolean, TRUE if header FALSE if not | |
| 130 --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...) | |
| 131 | |
| 132 Example: | |
| 133 ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n") | |
| 134 | |
| 135 q(save="no") | |
| 136 } | |
| 137 | |
| 138 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | |
| 139 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | |
| 140 args <- as.list(as.character(argsDF$V2)) | |
| 141 names(args) <- argsDF$V1 | |
| 142 | |
| 143 return(args) | |
| 144 } | |
| 145 | |
| 146 args <- get_args() | |
| 147 | |
| 148 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_pathways_visualization/args.Rda") | |
| 149 #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_pathways_visualization/args.Rda") | |
| 150 | |
| 151 ###setting variables | |
| 152 if (!is.null(args$pathways_id)) { | |
| 153 ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) | |
| 154 ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | |
| 155 }else if (!is.null(args$pathways_input)){ | |
| 156 header2 <- str2bool(args$header2) | |
| 157 pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col)) | |
| 158 pathways_file = read_file(args$pathways_input,header2) | |
| 159 ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | |
| 160 } | |
| 161 pathways_list <- read_file(args$pathways_list,F) | |
| 162 if (!is.null(args$id_list)) { | |
| 163 id_list <- get_list_from_cp(args$id_list) | |
| 164 } | |
| 165 id_type <- tolower(args$id_type) | |
| 166 ncol <- as.numeric(gsub("c", "" ,args$id_column)) | |
| 167 header <- str2bool(args$header) | |
| 168 native_kegg <- str2bool(args$native_kegg) | |
| 169 species=args$species | |
| 170 fold_change_data = str2bool(args$fold_change_data) | |
| 171 | |
| 172 #org list used in mapped2geneID | |
| 173 org <- c('Hs','Mm','Rn') | |
| 174 names(org) <- c('hsa','mmu','rno') | |
| 175 | |
| 176 #read input file or list | |
| 177 if (!is.null(args$input)){ | |
| 178 tab <- read_file(args$input,header) | |
| 179 tab <- data.frame(tab[which(tab[ncol]!=""),]) | |
| 180 } else { | |
| 181 tab <- data.frame(id_list,stringsAsFactors = F) | |
| 182 ncol=1 | |
| 183 } | |
| 184 | |
| 185 #fold change columns | |
| 186 #make sure its double and name expression value columns | |
| 187 if (fold_change_data){ | |
| 188 fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),","))) | |
| 189 if (length(fold_change) > 3) { fold_change= fold_change[1:3] } | |
| 190 for (i in 1:length(fold_change)) { | |
| 191 fc_col = fold_change[i] | |
| 192 colnames(tab)[fc_col] <- paste("e",i,sep='') | |
| 193 tab[,fc_col] <- as.double(gsub(",",".",as.character(tab[,fc_col]) )) | |
| 194 } | |
| 195 } | |
| 196 | |
| 197 ##### map uniprotID to entrez geneID | |
| 198 if (id_type == "uniprotid") { | |
| 199 uniprotID = tab[,ncol] | |
| 200 mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL) | |
| 201 geneID = mapped2geneID[,2] | |
| 202 tab = cbind(tab,geneID) | |
| 203 }else if (id_type == "geneid"){ | |
| 204 colnames(tab)[ncol] <- "geneID" | |
| 205 } | |
| 206 | |
| 207 geneID = as.character(tab$geneID[which(!is.na(tab$geneID))]) | |
| 208 geneID = gsub(" ","",geneID) | |
| 209 geneID = unlist(strsplit(geneID,"[;]")) | |
| 210 | |
| 211 ##### build matrix to map on KEGG pathway (kgml : KEGG xml) | |
| 212 if (fold_change_data) { | |
| 213 geneID_indices = which(!duplicated(geneID)) | |
| 214 if (length(fold_change) == 3){ | |
| 215 mat <- as.data.frame(cbind(tab$e1,tab$e2,tab$e3)[which(!is.na(tab$geneID)),]) | |
| 216 mat = mat[geneID_indices,] | |
| 217 row.names(mat) <- geneID[geneID_indices] | |
| 218 } else if (length(fold_change) == 2){ | |
| 219 mat <- as.data.frame(cbind(tab$e1,tab$e2)[which(!is.na(tab$geneID)),]) | |
| 220 mat = mat[geneID_indices,] | |
| 221 row.names(mat) <- geneID[geneID_indices] | |
| 222 } else { | |
| 223 mat <- as.data.frame(cbind(tab$e1)[which(!is.na(tab$geneID)),]) | |
| 224 mat = mat[geneID_indices,] | |
| 225 names(mat) <- geneID[geneID_indices] | |
| 226 } | |
| 227 } else { | |
| 228 mat <- geneID | |
| 229 } | |
| 230 | |
| 231 #####mapping geneID (with or without expression values) on KEGG pathway | |
| 232 plot.col.key= TRUE | |
| 233 low_color = "green" | |
| 234 mid_color = "#F3F781" #yellow | |
| 235 high_color = "red" | |
| 236 if (is.null(tab$e1)) { | |
| 237 plot.col.key= FALSE #if there's no exrepession data, we don't show the color key | |
| 238 high_color = "#81BEF7" #blue | |
| 239 } | |
| 240 | |
| 241 #create graph(s) and text output | |
| 242 for (id in ids) { | |
| 243 suffix= get_suffix(pathways_list,species,id) | |
| 244 pv.out <- suppressMessages(pathview(gene.data = mat, | |
| 245 gene.idtype = "entrez", | |
| 246 pathway.id = id, | |
| 247 species = species, | |
| 248 kegg.dir = ".", | |
| 249 out.suffix=suffix, | |
| 250 kegg.native = native_kegg, | |
| 251 low = list(gene = low_color, cpd = "blue"), | |
| 252 mid = list(gene = mid_color, cpd = "transparent"), | |
| 253 high = list(gene = high_color, cpd = "yellow"), | |
| 254 na.col="#D8D8D8", #gray | |
| 255 cpd.data=NULL, | |
| 256 plot.col.key = plot.col.key, | |
| 257 pdf.size=c(9,9))) | |
| 258 | |
| 259 if (is.list(pv.out)){ | |
| 260 | |
| 261 #creating text file | |
| 262 if (!exists("DF")) { | |
| 263 DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type)),stringsAsFactors = F,check.names = F) | |
| 264 } else { | |
| 265 #print (mapping_summary(pv.out,species,id)) | |
| 266 DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type)),stringsAsFactors = F,check.names = F)) | |
| 267 } | |
| 268 } | |
| 269 | |
| 270 } | |
| 271 | |
| 272 DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA | |
| 273 | |
| 274 #text file output | |
| 275 write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE) |
