Mercurial > repos > proteore > proteore_maps_visualization
comparison kegg_maps_visualization.R @ 12:10c572ad2b9e draft default tip
"planemo upload commit 209e9e48505e55a2a2b4b1d7b053f557344bc2e8-dirty"
| author | proteore |
|---|---|
| date | Mon, 10 May 2021 12:45:44 +0000 |
| parents | 7d04efde2526 |
| children |
comparison
equal
deleted
inserted
replaced
| 11:7d04efde2526 | 12:10c572ad2b9e |
|---|---|
| 1 #!/usr/bin/Rscript | 1 #!/usr/bin/Rscript |
| 2 #Rscript made for mapping genesID on KEGG pathway with Pathview package | 2 #Rscript made for mapping genesID on KEGG pathway with Pathview package |
| 3 #input : csv file containing ids (uniprot or geneID) to map, plus parameters | 3 #input : csv file containing ids (uniprot or geneid) to map, plus parameters |
| 4 #output : KEGG pathway : jpeg or pdf file. | 4 #output : KEGG pathway : jpeg or pdf file. |
| 5 | 5 |
| 6 options(warn=-1) #TURN OFF WARNINGS !!!!!! | 6 options(warn = -1) #TURN OFF WARNINGS !!!!!! |
| 7 suppressMessages(library("pathview")) | 7 suppressMessages(library("pathview")) |
| 8 suppressMessages(library(KEGGREST)) | 8 suppressMessages(library(KEGGREST)) |
| 9 | 9 |
| 10 read_file <- function(path,header){ | 10 read_file <- function(path, header) { |
| 11 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F, comment.char = ""),silent=TRUE) | 11 file <- try(read.csv(path, header = header, sep = "\t", |
| 12 if (inherits(file,"try-error")){ | 12 stringsAsFactors = FALSE, quote = "\"", check.names = F, |
| 13 stop("Read file error ! Please check your file (header, # character, etc) ") | 13 comment.char = ""), silent = TRUE) |
| 14 }else{ | 14 if (inherits(file, "try-error")) { |
| 15 stop("Read file error! Please check your file (header, # character, etc)") | |
| 16 }else { | |
| 15 return(file) | 17 return(file) |
| 16 } | 18 } |
| 17 } | 19 } |
| 18 | 20 |
| 19 ##### fuction to clean and concatenate pathway name (allow more flexibility for user input) | 21 ##### fuction to clean and concatenate pathway name |
| 20 concat_string <- function(x){ | 22 ##### (allow more flexibility for user input) |
| 21 x <- gsub(" - .*","",x) | 23 concat_string <- function(x) { |
| 22 x <- gsub(" ","",x) | 24 x <- gsub(" - .*", "", x) |
| 23 x <- gsub("-","",x) | 25 x <- gsub(" ", "", x) |
| 24 x <- gsub("_","",x) | 26 x <- gsub("-", "", x) |
| 25 x <- gsub(",","",x) | 27 x <- gsub("_", "", x) |
| 26 x <- gsub("\\'","",x) | 28 x <- gsub(",", "", x) |
| 27 x <- gsub("\\(.*)","",x) | 29 x <- gsub("\\'", "", x) |
| 28 x <- gsub("\\/","",x) | 30 x <- gsub("\\(.*)", "", x) |
| 31 x <- gsub("\\/", "", x) | |
| 29 x <- tolower(x) | 32 x <- tolower(x) |
| 30 return(x) | 33 return(x) |
| 31 } | 34 } |
| 32 | 35 |
| 33 #return output suffix (pathway name) from id kegg (ex : hsa:00010) | 36 #return output suffix (pathway name) from id kegg (ex : hsa:00010) |
| 34 get_suffix <- function(pathways_list,species,id){ | 37 get_suffix <- function(pathways_list, species, id) { |
| 35 suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2]) | 38 suffix <- gsub("/", "or", pathways_list[pathways_list[, 1] == paste( |
| 36 suffix = gsub(" ","_",suffix) | 39 species, id, sep = ""), 2]) |
| 37 if (nchar(suffix) > 50){ | 40 suffix <- gsub(" ", "_", suffix) |
| 38 suffix = substr(suffix,1,50) | 41 if (nchar(suffix) > 50) { |
| 42 suffix <- substr(suffix, 1, 50) | |
| 39 } | 43 } |
| 40 return(suffix) | 44 return(suffix) |
| 41 } | 45 } |
| 42 | 46 |
| 43 str2bool <- function(x){ | 47 str2bool <- function(x) { |
| 44 if (any(is.element(c("t","true"),tolower(x)))){ | 48 if (any(is.element(c("t", "true"), tolower(x)))) { |
| 45 return (TRUE) | 49 return(TRUE) |
| 46 }else if (any(is.element(c("f","false"),tolower(x)))){ | 50 }else if (any(is.element(c("f", "false"), tolower(x)))) { |
| 47 return (FALSE) | 51 return(FALSE) |
| 48 }else{ | 52 }else { |
| 49 return(NULL) | 53 return(NULL) |
| 50 } | 54 } |
| 51 } | 55 } |
| 52 | 56 |
| 53 is.letter <- function(x) grepl("[[:alpha:]]", x) | 57 is_letter <- function(x) grepl("[[:alpha:]]", x) |
| 54 | 58 |
| 55 #### hsa00010 -> 00010 | 59 remove_kegg_prefix <- function(x) { |
| 56 remove_kegg_prefix <- function(x){ | 60 x <- gsub(":", "", x) |
| 57 x = gsub(":","",x) | 61 if (substr(x, 1, 4) == "path") { |
| 58 if (substr(x,1,4) == 'path'){ | 62 x <- substr(x, 5, nchar(x)) |
| 59 x=substr(x,5,nchar(x)) | 63 } |
| 60 } | 64 if (is_letter(substr(x, 1, 3))) { |
| 61 if (is.letter(substr(x,1,3))){ | 65 x <- substr(x, 4, nchar(x)) |
| 62 x <- substr(x,4,nchar(x)) | |
| 63 } | 66 } |
| 64 return(x) | 67 return(x) |
| 65 } | 68 } |
| 66 | 69 |
| 67 kegg_to_geneID <- function(vector){ | 70 kegg_to_geneid <- function(vector) { |
| 68 vector <- sapply(vector, function(x) unlist(strsplit(x,":"))[2],USE.NAMES = F) | 71 #nolint start |
| 69 return (vector) | 72 vector <- sapply(vector, function(x) unlist( |
| 73 strsplit(x, ":"))[2], USE.NAMES = F) | |
| 74 #nolint end | |
| 75 return(vector) | |
| 70 } | 76 } |
| 71 | 77 |
| 72 clean_bad_character <- function(string) { | 78 clean_bad_character <- function(string) { |
| 73 string <- gsub("X","",string) | 79 string <- gsub("X", "", string) |
| 74 return(string) | 80 return(string) |
| 75 } | 81 } |
| 76 | 82 |
| 77 get_list_from_cp <-function(list){ | 83 get_list_from_cp <- function(list) { |
| 78 list = gsub(";","\t",list) | 84 list <- gsub(";", "\t", list) |
| 79 list = gsub(",","\t",list) | 85 list <- gsub(",", "\t", list) |
| 80 list = strsplit(list, "[ \t\n]+")[[1]] | 86 list <- strsplit(list, "[ \t\n]+")[[1]] |
| 81 list = list[list != ""] #remove empty entry | 87 list <- list[list != ""] #remove empty entry |
| 82 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") | 88 list <- gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") |
| 83 return(list) | 89 return(list) |
| 84 } | 90 } |
| 85 | 91 |
| 86 get_ref_pathways <- function(species){ | 92 get_ref_pathways <- function(species) { |
| 87 ##all available pathways for the species | 93 ##all available pathways for the species |
| 88 pathways <- keggLink("pathway", species) | 94 pathways <- keggLink("pathway", species) |
| 89 tot_path <- unique(pathways) | 95 tot_path <- unique(pathways) |
| 90 | 96 |
| 91 ##formating the dat into a list object | 97 ##formating the dat into a list object |
| 92 ##key= pathway ID, value = genes of the pathway in the kegg format | 98 ##key= pathway ID, value = genes of the pathway in the kegg format |
| 93 pathways_list <- sapply(tot_path, function(pathway) names(which(pathways==pathway))) | 99 pathways_list <- sapply(tot_path, function(pathway) |
| 94 return (pathways_list) | 100 names(which(pathways == pathway))) |
| 95 } | 101 return(pathways_list) |
| 96 | 102 } |
| 97 mapping_summary <- function(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID){ | 103 |
| 98 ref_pathways = get_ref_pathways(species) | 104 #nolint start |
| 99 names(ref_pathways) <- sapply(names(ref_pathways), function(x) gsub("path:[a-z]{3}","",x),USE.NAMES = F) | 105 mapping_summary <- function(pv_out, species, id, id_type, pathways_list, |
| 100 | 106 geneid, uniprotid, mapped2geneid) { |
| 107 ref_pathways <- get_ref_pathways(species) | |
| 108 names(ref_pathways) <- sapply(names(ref_pathways), function(x) | |
| 109 gsub("path:[a-z]{3}", "", x), USE.NAMES = F) | |
| 110 #nolint end | |
| 101 #genes present in pathway | 111 #genes present in pathway |
| 102 genes = ref_pathways[id][[1]] | 112 genes <- ref_pathways[id][[1]] |
| 103 nb_genes = length(genes) | 113 nb_genes <- length(genes) |
| 104 | 114 |
| 105 #genes mapped on pathway genes | 115 #genes mapped on pathway genes |
| 106 mapped <- unlist(sapply(pv.out$plot.data.gene$all.mapped, function(x) strsplit(x,",")),use.names = F) | 116 mapped <- unlist(sapply(pv_out$plot.data.gene$all.mapped, |
| 107 mapped = unique(mapped[mapped!=""]) | 117 function(x) strsplit(x, ",")), use.names = F) |
| 118 mapped <- unique(mapped[mapped != ""]) | |
| 108 nb_mapped <- length(mapped) | 119 nb_mapped <- length(mapped) |
| 109 | 120 |
| 110 #compue ratio of mapping | 121 #compute ratio of mapping |
| 111 ratio = round((nb_mapped/nb_genes)*100, 2) | 122 ratio <- round((nb_mapped / nb_genes) * 100, 2) |
| 112 if (is.nan(ratio)) { ratio = ""} | 123 if (is.nan(ratio)) { |
| 113 pathway_id = paste(species,id,sep="") | 124 ratio <- "" |
| 114 pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2]) | 125 } |
| 115 | 126 pathway_id <- paste(species, id, sep = "") |
| 116 if (id_type=="geneid" || id_type=="keggid") { | 127 pathway_name <- as.character( |
| 117 row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";")) | 128 pathways_list[pathways_list[, 1] == pathway_id, ][2]) |
| 118 names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | 129 |
| 119 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped") | 130 if (id_type == "geneid" || id_type == "keggid") { |
| 120 } else if (id_type=="uniprotid") { | 131 row <- c(pathway_id, pathway_name, length(unique(geneid)), |
| 121 row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";")) | 132 nb_mapped, nb_genes, ratio, paste(mapped, collapse = ";")) |
| 122 names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | 133 names(row) <- c("KEGG pathway ID", "pathway name", |
| 123 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped") | 134 "nb of Entrez gene ID used", "nb of Entrez gene ID mapped", |
| 124 } | 135 "nb of Entrez gene ID in the pathway", |
| 136 "ratio of Entrez gene ID mapped (%)", | |
| 137 "Entrez gene ID mapped") | |
| 138 } else if (id_type == "uniprotid") { | |
| 139 row <- c(pathway_id, pathway_name, length(unique(uniprotid)), | |
| 140 length(unique(geneid)), nb_mapped, nb_genes, ratio, | |
| 141 paste(mapped, collapse = ";"), | |
| 142 paste(mapped2geneid[which(mapped2geneid[, 2] %in% mapped)], | |
| 143 collapse = ";")) | |
| 144 names(row) <- c("KEGG pathway ID", "pathway name", "nb of Uniprot_AC used", | |
| 145 "nb of Entrez gene ID used", "nb of Entrez gene ID mapped", | |
| 146 "nb of Entrez gene ID in the pathway", | |
| 147 "ratio of Entrez gene ID mapped (%)", | |
| 148 "Entrez gene ID mapped", "uniprot_AC mapped") | |
| 149 } | |
| 125 return(row) | 150 return(row) |
| 126 } | 151 } |
| 127 | 152 |
| 128 #take data frame, return data frame | 153 #take data frame, return data frame |
| 129 split_ids_per_line <- function(line,ncol){ | 154 split_ids_per_line <- function(line, ncol) { |
| 130 | 155 |
| 131 #print (line) | 156 #print (line) |
| 132 header = colnames(line) | 157 header <- colnames(line) |
| 133 line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) | 158 line[ncol] <- gsub("[[:blank:]]|\u00A0", "", line[ncol]) |
| 134 | 159 |
| 135 if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { | 160 if (length(unlist(strsplit(as.character(line[ncol]), ";"))) > 1) { |
| 136 if (length(line)==1 ) { | 161 if (length(line) == 1) { |
| 137 lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) | 162 lines <- as.data.frame(unlist(strsplit(as.character(line[ncol]), ";")), |
| 163 stringsAsFactors = F) | |
| 138 } else { | 164 } else { |
| 139 if (ncol==1) { #first column | 165 if (ncol == 1) { #first column |
| 140 lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) | 166 lines <- suppressWarnings(cbind(unlist(strsplit( |
| 141 } else if (ncol==length(line)) { #last column | 167 as.character(line[ncol]), ";")), line[2:length(line)])) |
| 142 lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) | 168 } else if (ncol == length(line)) { #last column |
| 169 lines <- suppressWarnings(cbind(line[1:ncol - 1], | |
| 170 unlist(strsplit(as.character(line[ncol]), ";")))) | |
| 143 } else { | 171 } else { |
| 144 lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) | 172 lines <- suppressWarnings(cbind(line[1:ncol - 1], |
| 173 unlist(strsplit(as.character(line[ncol]), ";"), use.names = F), | |
| 174 line[(ncol + 1):length(line)])) | |
| 145 } | 175 } |
| 146 } | 176 } |
| 147 colnames(lines)=header | 177 colnames(lines) <- header |
| 148 return(lines) | 178 return(lines) |
| 149 } else { | 179 } else { |
| 150 return(line) | 180 return(line) |
| 151 } | 181 } |
| 152 } | 182 } |
| 153 | 183 |
| 154 #create new lines if there's more than one id per cell in the columns in order to have only one id per line | 184 #create new lines if there's more than one id per cell in the columns in order |
| 155 one_id_one_line <-function(tab,ncol){ | 185 # to have only one id per line |
| 156 | 186 one_id_one_line <- function(tab, ncol) { |
| 157 if (ncol(tab)>1){ | 187 |
| 158 | 188 if (ncol(tab) > 1) { |
| 159 tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) | 189 |
| 160 header=colnames(tab) | 190 tab[, ncol] <- sapply(tab[, ncol], function(x) gsub("[[:blank:]]", "", x)) |
| 161 res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) | 191 header <- colnames(tab) |
| 162 for (i in 1:nrow(tab) ) { | 192 res <- as.data.frame(matrix(ncol = ncol(tab), nrow = 0)) |
| 163 lines = split_ids_per_line(tab[i,],ncol) | 193 for (i in seq_len(nrow(tab))) { |
| 164 res = rbind(res,lines) | 194 lines <- split_ids_per_line(tab[i, ], ncol) |
| 165 } | 195 res <- rbind(res, lines) |
| 166 }else { | 196 } |
| 167 res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) | 197 } else { |
| 168 res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) | 198 res <- unlist(sapply(tab[, 1], function(x) strsplit(x, ";")), use.names = F) |
| 169 colnames(res)=colnames(tab) | 199 res <- data.frame(res[which(!is.na(res[res != ""]))], stringsAsFactors = F) |
| 200 colnames(res) <- colnames(tab) | |
| 170 } | 201 } |
| 171 return(res) | 202 return(res) |
| 172 } | 203 } |
| 173 | 204 |
| 174 get_limit <- function(mat) { | 205 get_limit <- function(mat) { |
| 175 min = min(apply(mat,2,min,na.rm=TRUE)) | 206 min <- min(apply(mat, 2, min, na.rm = TRUE)) |
| 176 max = max(apply(mat,2,max,na.rm=TRUE)) | 207 max <- max(apply(mat, 2, max, na.rm = TRUE)) |
| 177 return(c(min,max)) | 208 return(c(min, max)) |
| 178 } | 209 } |
| 179 | 210 |
| 180 check_pathway_ids<- function(pathways_ids) { | 211 check_pathway_ids <- function(pathways_ids) { |
| 181 problematic_pathways <- c("04215","04723") | 212 problematic_pathways <- c("04215", "04723") |
| 182 to_remove <- intersect(pathways_ids,problematic_pathways) | 213 to_remove <- intersect(pathways_ids, problematic_pathways) |
| 183 if (length(to_remove) > 0){ print(paste("Pathway(s)",to_remove,"have been remove from input")) } | 214 if (length(to_remove) > 0) { |
| 215 print(paste("Pathway(s)", to_remove, "have been remove from input")) | |
| 216 } | |
| 184 pathways_ids <- pathways_ids[which(!pathways_ids %in% problematic_pathways)] | 217 pathways_ids <- pathways_ids[which(!pathways_ids %in% problematic_pathways)] |
| 185 if (length(pathways_ids) == 0){stop("No pathways ids to process")} | 218 if (length(pathways_ids) == 0) { |
| 186 return (pathways_ids) | 219 stop("No pathways ids to process") |
| 187 } | 220 } |
| 188 | 221 return(pathways_ids) |
| 189 get_args <- function(){ | 222 } |
| 190 | 223 |
| 224 get_args <- function() { | |
| 225 | |
| 191 ## Collect arguments | 226 ## Collect arguments |
| 192 args <- commandArgs(TRUE) | 227 args <- commandArgs(TRUE) |
| 193 | 228 |
| 194 ## Default setting when no arguments passed | 229 ## Default setting when no arguments passed |
| 195 if(length(args) < 1) { | 230 if (length(args) < 1) { |
| 196 args <- c("--help") | 231 args <- c("--help") |
| 197 } | 232 } |
| 198 | 233 |
| 199 ## Help section | 234 ## Help section |
| 200 if("--help" %in% args) { | 235 if ("--help" %in% args) { |
| 201 cat("Pathview R script | 236 cat("Pathview R script |
| 202 Arguments: | 237 Arguments: |
| 203 --help Print this test | 238 --help Print this test |
| 204 --input path of the input file (must contains a colum of uniprot and/or geneID accession number) | 239 --input path of the input file (must contains a colum of |
| 240 uniprot and/or geneid accession number) | |
| 205 --id_list list of ids to use, ',' separated | 241 --id_list list of ids to use, ',' separated |
| 206 --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 | 242 --pathways_id Id(s) of pathway(s) to use, if several, semicolon |
| 207 --id_type Type of accession number ('uniprotID' or 'geneID') | 243 separated list : hsa00010;hsa05412 |
| 208 --id_column Column containing accesion number of interest (ex : 'c1') | 244 --id_type Type of accession number ('uniprotid' or 'geneid') |
| 245 --id_column Column containing accesion number of interest | |
| 246 (ex : 'c1') | |
| 209 --header Boolean, TRUE if header FALSE if not | 247 --header Boolean, TRUE if header FALSE if not |
| 210 --output Output filename | 248 --output Output filename |
| 211 --fold_change_col Column(s) containing fold change values (comma separated) | 249 --fold_change_col Column(s) containing fold change values |
| 250 (comma separated) | |
| 212 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph | 251 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph |
| 213 --species KEGG species (hsa, mmu, ...) | 252 --species KEGG species (hsa, mmu, ...) |
| 214 --pathways_input Tab with pathways in a column, output format of find_pathways | 253 --pathways_input Tab with pathways in a column, output format of |
| 254 find_pathways | |
| 215 --pathway_col Column of pathways to use | 255 --pathway_col Column of pathways to use |
| 216 --header2 Boolean, TRUE if header FALSE if not | 256 --header2 Boolean, TRUE if header FALSE if not |
| 217 --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...) | 257 --pathways_list path of file containg the species pathways list |
| 258 (hsa_pathways.loc, mmu_pathways.loc, ...) | |
| 218 | 259 |
| 219 Example: | 260 Example: |
| 220 ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n") | 261 ./PathView.R --input 'input.csv' --pathway_id '05412' |
| 221 | 262 --id_type 'uniprotid' --id_column 'c1' --header TRUE \n\n") |
| 222 q(save="no") | 263 |
| 223 } | 264 q(save = "no") |
| 224 | 265 } |
| 225 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | 266 |
| 226 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | 267 parseargs <- function(x) strsplit(sub("^--", "", x), "=") |
| 227 args <- as.list(as.character(argsDF$V2)) | 268 argsdf <- as.data.frame(do.call("rbind", parseargs(args))) |
| 228 names(args) <- argsDF$V1 | 269 args <- as.list(as.character(argsdf$V2)) |
| 229 | 270 names(args) <- argsdf$V1 |
| 271 | |
| 230 return(args) | 272 return(args) |
| 231 } | 273 } |
| 232 | 274 |
| 233 main <- function(){ | 275 main <- function() { |
| 234 | 276 |
| 235 args <- get_args() | 277 args <- get_args() |
| 236 | 278 |
| 237 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") | 279 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/ |
| 238 #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") | 280 #kegg_maps_visualization/args.Rda") |
| 239 | 281 #load("/home/dchristiany/proteore_project/ProteoRE/tools/ |
| 282 #kegg_maps_visualization/args.Rda") | |
| 283 | |
| 240 ###setting variables | 284 ###setting variables |
| 241 if (!is.null(args$pathways_id)) { | 285 if (!is.null(args$pathways_id)) { |
| 242 ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) | 286 ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) |
| 243 ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | 287 ids <- sapply(ids, function(x) remove_kegg_prefix(x), USE.NAMES = FALSE) |
| 244 }else if (!is.null(args$pathways_input)){ | 288 }else if (!is.null(args$pathways_input)) { |
| 245 header2 <- str2bool(args$header2) | 289 header2 <- str2bool(args$header2) |
| 246 pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col)) | 290 pathway_col <- as.numeric(gsub("c", "", args$pathway_col)) |
| 247 pathways_file = read_file(args$pathways_input,header2) | 291 pathways_file <- read_file(args$pathways_input, header2) |
| 248 ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | 292 ids <- sapply(rapply(strsplit(clean_bad_character( |
| 249 } | 293 pathways_file[, pathway_col]), ","), c), |
| 250 if (args$native_kegg) {ids = check_pathway_ids(ids)} #remove problematic pathways | 294 function(x) remove_kegg_prefix(x), USE.NAMES = FALSE) |
| 251 pathways_list <- read_file(args$pathways_list,F) | 295 } |
| 296 if (args$native_kegg) { | |
| 297 ids <- check_pathway_ids(ids) | |
| 298 } #remove problematic pathways | |
| 299 pathways_list <- read_file(args$pathways_list, F) | |
| 252 if (!is.null(args$id_list)) { | 300 if (!is.null(args$id_list)) { |
| 253 id_list <- get_list_from_cp(args$id_list) | 301 id_list <- get_list_from_cp(args$id_list) |
| 254 } | 302 } |
| 255 id_type <- tolower(args$id_type) | 303 id_type <- tolower(args$id_type) |
| 256 ncol <- as.numeric(gsub("c", "" ,args$id_column)) | 304 ncol <- as.numeric(gsub("c", "", args$id_column)) |
| 257 header <- str2bool(args$header) | 305 header <- str2bool(args$header) |
| 258 native_kegg <- str2bool(args$native_kegg) | 306 native_kegg <- str2bool(args$native_kegg) |
| 259 species=args$species | 307 species <- args$species |
| 260 fold_change_data = str2bool(args$fold_change_data) | 308 fold_change_data <- str2bool(args$fold_change_data) |
| 261 | 309 |
| 262 #org list used in mapped2geneID | 310 #org list used in mapped2geneid |
| 263 org <- c('Hs','Mm','Rn') | 311 org <- c("Hs", "Mm", "Rn") |
| 264 names(org) <- c('hsa','mmu','rno') | 312 names(org) <- c("hsa", "mmu", "rno") |
| 265 | 313 |
| 266 #read input file or list | 314 #read input file or list |
| 267 if (!is.null(args$input)){ | 315 if (!is.null(args$input)) { |
| 268 tab <- read_file(args$input,header) | 316 tab <- read_file(args$input, header) |
| 269 tab <- data.frame(tab[which(tab[ncol]!=""),],stringsAsFactors = F) | 317 tab <- data.frame(tab[which(tab[ncol] != ""), ], stringsAsFactors = F) |
| 270 tab = one_id_one_line(tab,ncol) | 318 tab <- one_id_one_line(tab, ncol) |
| 271 } else { | 319 } else { |
| 272 id_list = gsub("[[:blank:]]|\u00A0|NA","",id_list) | 320 id_list <- gsub("[[:blank:]]|\u00A0|NA", "", id_list) |
| 273 id_list = unique(id_list[id_list!=""]) | 321 id_list <- unique(id_list[id_list != ""]) |
| 274 tab <- data.frame(id_list,stringsAsFactors = F) | 322 tab <- data.frame(id_list, stringsAsFactors = F) |
| 275 ncol=1 | 323 ncol <- 1 |
| 276 } | 324 } |
| 277 | 325 |
| 278 | 326 |
| 279 ##### map uniprotID to entrez geneID and kegg to geneID | 327 ##### map uniprotid to entrez geneid and kegg to geneid |
| 280 uniprotID="" | 328 uniprotid <- "" |
| 281 mapped2geneID="" | 329 mapped2geneid <- "" |
| 282 if (id_type == "uniprotid") { | 330 if (id_type == "uniprotid") { |
| 283 uniprotID=tab[,ncol] | 331 uniprotid <- tab[, ncol] |
| 284 mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL) | 332 mapped2geneid <- id2eg(ids = uniprotid, category = "uniprot", |
| 285 geneID = mapped2geneID[,2] | 333 org = org[[species]], pkg.name = NULL) |
| 286 tab = cbind(tab,geneID) | 334 geneid <- mapped2geneid[, 2] |
| 287 ncol=ncol(tab) | 335 tab <- cbind(tab, geneid) |
| 288 }else if (id_type == "keggid"){ | 336 ncol <- ncol(tab) |
| 289 keggID = tab[,ncol] | 337 }else if (id_type == "keggid") { |
| 290 geneID = kegg_to_geneID(keggID) | 338 keggid <- tab[, ncol] |
| 291 tab = cbind(tab,geneID) | 339 geneid <- kegg_to_geneid(keggid) |
| 292 ncol=ncol(tab) | 340 tab <- cbind(tab, geneid) |
| 293 }else if (id_type == "geneid"){ | 341 ncol <- ncol(tab) |
| 294 colnames(tab)[ncol] <- "geneID" | 342 }else if (id_type == "geneid") { |
| 295 } | 343 colnames(tab)[ncol] <- "geneid" |
| 296 | 344 } |
| 345 | |
| 297 ##### build matrix to map on KEGG pathway (kgml : KEGG xml) | 346 ##### build matrix to map on KEGG pathway (kgml : KEGG xml) |
| 298 geneID_indices = which(!is.na(tab$geneID)) | 347 geneid_indices <- which(!is.na(tab$geneid)) |
| 299 if (fold_change_data) { | 348 if (fold_change_data) { |
| 300 fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),","))) | 349 fold_change <- as.integer(unlist(strsplit(gsub("c", "", |
| 301 if (length(fold_change) > 3) { fold_change= fold_change[1:3] } | 350 args$fold_change_col), ","))) |
| 302 if (length(fold_change)==1){ | 351 if (length(fold_change) > 3) { |
| 303 tab[,fold_change] <- as.double(gsub(",",".",as.character(tab[,fold_change]) )) | 352 fold_change <- fold_change[1:3] |
| 353 } | |
| 354 if (length(fold_change) == 1) { | |
| 355 tab[, fold_change] <- as.double(gsub( | |
| 356 ",", ".", as.character(tab[, fold_change]))) | |
| 304 } else { | 357 } else { |
| 305 tab[,fold_change] <- apply(tab[,fold_change],2,function(x) as.double(gsub(",",".",as.character(x)))) | 358 tab[, fold_change] <- apply(tab[, fold_change], 2, |
| 306 } | 359 function(x) as.double(gsub(",", ".", as.character(x)))) |
| 307 mat = tab[geneID_indices,c(ncol,fold_change)] | 360 } |
| 308 mat = mat[(!duplicated(mat$geneID)),] | 361 mat <- tab[geneid_indices, c(ncol, fold_change)] |
| 309 geneID=mat$geneID | 362 mat <- mat[(!duplicated(mat$geneid)), ] |
| 310 mat = as.data.frame(mat[,-1]) | 363 geneid <- mat$geneid |
| 311 row.names(mat)=geneID | 364 mat <- as.data.frame(mat[, -1]) |
| 312 limit = get_limit(mat) | 365 row.names(mat) <- geneid |
| 366 limit <- get_limit(mat) | |
| 313 } else { | 367 } else { |
| 314 mat = unique(as.character(tab$geneID[!is.na(tab$geneID[tab$geneID!=""])])) | 368 mat <- unique(as.character( |
| 315 geneID=mat | 369 tab$geneid[!is.na(tab$geneid[tab$geneid != ""])])) |
| 316 limit=1 | 370 geneid <- mat |
| 317 } | 371 limit <- 1 |
| 318 | 372 } |
| 319 #####mapping geneID (with or without expression values) on KEGG pathway | 373 |
| 320 plot.col.key= TRUE | 374 #####mapping geneid (with or without expression values) on KEGG pathway |
| 321 low_color = "green" | 375 plot_col_key <- TRUE |
| 322 mid_color = "#F3F781" #yellow | 376 low_color <- "green" |
| 323 high_color = "red" | 377 mid_color <- "#F3F781" #yellow |
| 378 high_color <- "red" | |
| 324 if (!fold_change_data) { | 379 if (!fold_change_data) { |
| 325 plot.col.key= FALSE #if there's no exrepession data, we don't show the color key | 380 plot_col_key <- FALSE |
| 326 high_color = "#81BEF7" #blue | 381 #if there's no exrepession data, we don't show the color key |
| 327 } | 382 high_color <- "#81BEF7" #blue |
| 328 | 383 } |
| 384 | |
| 329 #create graph(s) and text output | 385 #create graph(s) and text output |
| 330 for (id in ids) { | 386 for (id in ids) { |
| 331 suffix= get_suffix(pathways_list,species,id) | 387 suffix <- get_suffix(pathways_list, species, id) |
| 332 pv.out <- suppressMessages(pathview(gene.data = mat, | 388 pv_out <- suppressMessages(pathview(gene.data = mat, |
| 333 gene.idtype = "entrez", | 389 gene.idtype = "entrez", |
| 334 pathway.id = id, | 390 pathway.id = id, |
| 335 species = species, | 391 species = species, |
| 336 kegg.dir = ".", | 392 kegg.dir = ".", |
| 337 out.suffix=suffix, | 393 out.suffix = suffix, |
| 338 kegg.native = native_kegg, | 394 kegg.native = native_kegg, |
| 339 low = list(gene = low_color, cpd = "blue"), | 395 low = list(gene = low_color, cpd = "blue"), |
| 340 mid = list(gene = mid_color, cpd = "transparent"), | 396 mid = list(gene = mid_color, cpd = "transparent"), |
| 341 high = list(gene = high_color, cpd = "yellow"), | 397 high = list(gene = high_color, cpd = "yellow"), |
| 342 na.col="#D8D8D8", #gray | 398 na.col = "#D8D8D8", #gray |
| 343 cpd.data=NULL, | 399 cpd.data = NULL, |
| 344 plot.col.key = plot.col.key, | 400 plot_col_key = plot_col_key, |
| 345 pdf.size=c(9,9), | 401 pdf.size = c(9, 9), |
| 346 limit=list(gene=limit, cpd=limit))) | 402 limit = list(gene = limit, cpd = limit))) |
| 347 | 403 |
| 348 if (is.list(pv.out)){ | 404 if (is.list(pv_out)) { |
| 349 | 405 |
| 350 #creating text file | 406 #creating text file |
| 351 if (!exists("DF")) { | 407 if (!exists("df")) { |
| 352 DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F) | 408 df <- data.frame(t(mapping_summary(pv_out, species, id, id_type, |
| 409 pathways_list, geneid, uniprotid, mapped2geneid)), | |
| 410 stringsAsFactors = F, check.names = F) | |
| 353 } else { | 411 } else { |
| 354 #print (mapping_summary(pv.out,species,id)) | 412 df <- rbind(df, data.frame(t(mapping_summary(pv_out, species, id, |
| 355 DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F)) | 413 id_type, pathways_list, geneid, uniprotid, mapped2geneid)), |
| 414 stringsAsFactors = F, check.names = F)) | |
| 356 } | 415 } |
| 357 } | 416 } |
| 358 } | 417 } |
| 359 | 418 |
| 360 DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA | 419 df <- as.data.frame(apply(df, c(1, 2), |
| 361 | 420 function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA |
| 421 | |
| 362 #text file output | 422 #text file output |
| 363 write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE) | 423 write.table(df, file = args$output, quote = FALSE, sep = "\t", |
| 424 row.names = FALSE, col.names = TRUE) | |
| 364 } | 425 } |
| 365 | 426 |
| 366 main() | 427 main() |
