Mercurial > repos > proteore > proteore_maps_visualization
view kegg_maps_visualization.R @ 12:10c572ad2b9e draft default tip
"planemo upload commit 209e9e48505e55a2a2b4b1d7b053f557344bc2e8-dirty"
| author | proteore |
|---|---|
| date | Mon, 10 May 2021 12:45:44 +0000 |
| parents | 7d04efde2526 |
| children |
line wrap: on
line source
#!/usr/bin/Rscript #Rscript made for mapping genesID on KEGG pathway with Pathview package #input : csv file containing ids (uniprot or geneid) to map, plus parameters #output : KEGG pathway : jpeg or pdf file. options(warn = -1) #TURN OFF WARNINGS !!!!!! suppressMessages(library("pathview")) suppressMessages(library(KEGGREST)) read_file <- function(path, header) { file <- try(read.csv(path, header = header, sep = "\t", stringsAsFactors = FALSE, quote = "\"", check.names = F, comment.char = ""), silent = TRUE) if (inherits(file, "try-error")) { stop("Read file error! Please check your file (header, # character, etc)") }else { return(file) } } ##### fuction to clean and concatenate pathway name ##### (allow more flexibility for user input) concat_string <- function(x) { x <- gsub(" - .*", "", x) x <- gsub(" ", "", x) x <- gsub("-", "", x) x <- gsub("_", "", x) x <- gsub(",", "", x) x <- gsub("\\'", "", x) x <- gsub("\\(.*)", "", x) x <- gsub("\\/", "", x) x <- tolower(x) return(x) } #return output suffix (pathway name) from id kegg (ex : hsa:00010) get_suffix <- function(pathways_list, species, id) { suffix <- gsub("/", "or", pathways_list[pathways_list[, 1] == paste( species, id, sep = ""), 2]) suffix <- gsub(" ", "_", suffix) if (nchar(suffix) > 50) { suffix <- substr(suffix, 1, 50) } return(suffix) } str2bool <- function(x) { if (any(is.element(c("t", "true"), tolower(x)))) { return(TRUE) }else if (any(is.element(c("f", "false"), tolower(x)))) { return(FALSE) }else { return(NULL) } } is_letter <- function(x) grepl("[[:alpha:]]", x) remove_kegg_prefix <- function(x) { x <- gsub(":", "", x) if (substr(x, 1, 4) == "path") { x <- substr(x, 5, nchar(x)) } if (is_letter(substr(x, 1, 3))) { x <- substr(x, 4, nchar(x)) } return(x) } kegg_to_geneid <- function(vector) { #nolint start vector <- sapply(vector, function(x) unlist( strsplit(x, ":"))[2], USE.NAMES = F) #nolint end return(vector) } clean_bad_character <- function(string) { string <- gsub("X", "", string) return(string) } get_list_from_cp <- function(list) { list <- gsub(";", "\t", list) list <- gsub(",", "\t", list) list <- strsplit(list, "[ \t\n]+")[[1]] list <- list[list != ""] #remove empty entry list <- gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") return(list) } get_ref_pathways <- function(species) { ##all available pathways for the species pathways <- keggLink("pathway", species) tot_path <- unique(pathways) ##formating the dat into a list object ##key= pathway ID, value = genes of the pathway in the kegg format pathways_list <- sapply(tot_path, function(pathway) names(which(pathways == pathway))) return(pathways_list) } #nolint start mapping_summary <- function(pv_out, species, id, id_type, pathways_list, geneid, uniprotid, mapped2geneid) { ref_pathways <- get_ref_pathways(species) names(ref_pathways) <- sapply(names(ref_pathways), function(x) gsub("path:[a-z]{3}", "", x), USE.NAMES = F) #nolint end #genes present in pathway genes <- ref_pathways[id][[1]] nb_genes <- length(genes) #genes mapped on pathway genes mapped <- unlist(sapply(pv_out$plot.data.gene$all.mapped, function(x) strsplit(x, ",")), use.names = F) mapped <- unique(mapped[mapped != ""]) nb_mapped <- length(mapped) #compute ratio of mapping ratio <- round((nb_mapped / nb_genes) * 100, 2) if (is.nan(ratio)) { ratio <- "" } pathway_id <- paste(species, id, sep = "") pathway_name <- as.character( pathways_list[pathways_list[, 1] == pathway_id, ][2]) if (id_type == "geneid" || id_type == "keggid") { row <- c(pathway_id, pathway_name, length(unique(geneid)), nb_mapped, nb_genes, ratio, paste(mapped, collapse = ";")) names(row) <- c("KEGG pathway ID", "pathway name", "nb of Entrez gene ID used", "nb of Entrez gene ID mapped", "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)", "Entrez gene ID mapped") } else if (id_type == "uniprotid") { row <- c(pathway_id, pathway_name, length(unique(uniprotid)), length(unique(geneid)), nb_mapped, nb_genes, ratio, paste(mapped, collapse = ";"), paste(mapped2geneid[which(mapped2geneid[, 2] %in% mapped)], collapse = ";")) names(row) <- c("KEGG pathway ID", "pathway name", "nb of Uniprot_AC used", "nb of Entrez gene ID used", "nb of Entrez gene ID mapped", "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)", "Entrez gene ID mapped", "uniprot_AC mapped") } return(row) } #take data frame, return data frame split_ids_per_line <- function(line, ncol) { #print (line) header <- colnames(line) line[ncol] <- gsub("[[:blank:]]|\u00A0", "", line[ncol]) if (length(unlist(strsplit(as.character(line[ncol]), ";"))) > 1) { if (length(line) == 1) { lines <- as.data.frame(unlist(strsplit(as.character(line[ncol]), ";")), stringsAsFactors = F) } else { if (ncol == 1) { #first column lines <- suppressWarnings(cbind(unlist(strsplit( as.character(line[ncol]), ";")), line[2:length(line)])) } else if (ncol == length(line)) { #last column lines <- suppressWarnings(cbind(line[1:ncol - 1], unlist(strsplit(as.character(line[ncol]), ";")))) } else { lines <- suppressWarnings(cbind(line[1:ncol - 1], unlist(strsplit(as.character(line[ncol]), ";"), use.names = F), line[(ncol + 1):length(line)])) } } colnames(lines) <- header return(lines) } else { return(line) } } #create new lines if there's more than one id per cell in the columns in order # to have only one id per line one_id_one_line <- function(tab, ncol) { if (ncol(tab) > 1) { tab[, ncol] <- sapply(tab[, ncol], function(x) gsub("[[:blank:]]", "", x)) header <- colnames(tab) res <- as.data.frame(matrix(ncol = ncol(tab), nrow = 0)) for (i in seq_len(nrow(tab))) { lines <- split_ids_per_line(tab[i, ], ncol) res <- rbind(res, lines) } } else { res <- unlist(sapply(tab[, 1], function(x) strsplit(x, ";")), use.names = F) res <- data.frame(res[which(!is.na(res[res != ""]))], stringsAsFactors = F) colnames(res) <- colnames(tab) } return(res) } get_limit <- function(mat) { min <- min(apply(mat, 2, min, na.rm = TRUE)) max <- max(apply(mat, 2, max, na.rm = TRUE)) return(c(min, max)) } check_pathway_ids <- function(pathways_ids) { problematic_pathways <- c("04215", "04723") to_remove <- intersect(pathways_ids, problematic_pathways) if (length(to_remove) > 0) { print(paste("Pathway(s)", to_remove, "have been remove from input")) } pathways_ids <- pathways_ids[which(!pathways_ids %in% problematic_pathways)] if (length(pathways_ids) == 0) { stop("No pathways ids to process") } return(pathways_ids) } get_args <- function() { ## Collect arguments args <- commandArgs(TRUE) ## Default setting when no arguments passed if (length(args) < 1) { args <- c("--help") } ## Help section if ("--help" %in% args) { cat("Pathview R script Arguments: --help Print this test --input path of the input file (must contains a colum of uniprot and/or geneid accession number) --id_list list of ids to use, ',' separated --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 --id_type Type of accession number ('uniprotid' or 'geneid') --id_column Column containing accesion number of interest (ex : 'c1') --header Boolean, TRUE if header FALSE if not --output Output filename --fold_change_col Column(s) containing fold change values (comma separated) --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph --species KEGG species (hsa, mmu, ...) --pathways_input Tab with pathways in a column, output format of find_pathways --pathway_col Column of pathways to use --header2 Boolean, TRUE if header FALSE if not --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...) Example: ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotid' --id_column 'c1' --header TRUE \n\n") q(save = "no") } parseargs <- function(x) strsplit(sub("^--", "", x), "=") argsdf <- as.data.frame(do.call("rbind", parseargs(args))) args <- as.list(as.character(argsdf$V2)) names(args) <- argsdf$V1 return(args) } main <- function() { args <- get_args() #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/ #kegg_maps_visualization/args.Rda") #load("/home/dchristiany/proteore_project/ProteoRE/tools/ #kegg_maps_visualization/args.Rda") ###setting variables if (!is.null(args$pathways_id)) { ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) ids <- sapply(ids, function(x) remove_kegg_prefix(x), USE.NAMES = FALSE) }else if (!is.null(args$pathways_input)) { header2 <- str2bool(args$header2) pathway_col <- as.numeric(gsub("c", "", args$pathway_col)) pathways_file <- read_file(args$pathways_input, header2) ids <- sapply(rapply(strsplit(clean_bad_character( pathways_file[, pathway_col]), ","), c), function(x) remove_kegg_prefix(x), USE.NAMES = FALSE) } if (args$native_kegg) { ids <- check_pathway_ids(ids) } #remove problematic pathways pathways_list <- read_file(args$pathways_list, F) if (!is.null(args$id_list)) { id_list <- get_list_from_cp(args$id_list) } id_type <- tolower(args$id_type) ncol <- as.numeric(gsub("c", "", args$id_column)) header <- str2bool(args$header) native_kegg <- str2bool(args$native_kegg) species <- args$species fold_change_data <- str2bool(args$fold_change_data) #org list used in mapped2geneid org <- c("Hs", "Mm", "Rn") names(org) <- c("hsa", "mmu", "rno") #read input file or list if (!is.null(args$input)) { tab <- read_file(args$input, header) tab <- data.frame(tab[which(tab[ncol] != ""), ], stringsAsFactors = F) tab <- one_id_one_line(tab, ncol) } else { id_list <- gsub("[[:blank:]]|\u00A0|NA", "", id_list) id_list <- unique(id_list[id_list != ""]) tab <- data.frame(id_list, stringsAsFactors = F) ncol <- 1 } ##### map uniprotid to entrez geneid and kegg to geneid uniprotid <- "" mapped2geneid <- "" if (id_type == "uniprotid") { uniprotid <- tab[, ncol] mapped2geneid <- id2eg(ids = uniprotid, category = "uniprot", org = org[[species]], pkg.name = NULL) geneid <- mapped2geneid[, 2] tab <- cbind(tab, geneid) ncol <- ncol(tab) }else if (id_type == "keggid") { keggid <- tab[, ncol] geneid <- kegg_to_geneid(keggid) tab <- cbind(tab, geneid) ncol <- ncol(tab) }else if (id_type == "geneid") { colnames(tab)[ncol] <- "geneid" } ##### build matrix to map on KEGG pathway (kgml : KEGG xml) geneid_indices <- which(!is.na(tab$geneid)) if (fold_change_data) { fold_change <- as.integer(unlist(strsplit(gsub("c", "", args$fold_change_col), ","))) if (length(fold_change) > 3) { fold_change <- fold_change[1:3] } if (length(fold_change) == 1) { tab[, fold_change] <- as.double(gsub( ",", ".", as.character(tab[, fold_change]))) } else { tab[, fold_change] <- apply(tab[, fold_change], 2, function(x) as.double(gsub(",", ".", as.character(x)))) } mat <- tab[geneid_indices, c(ncol, fold_change)] mat <- mat[(!duplicated(mat$geneid)), ] geneid <- mat$geneid mat <- as.data.frame(mat[, -1]) row.names(mat) <- geneid limit <- get_limit(mat) } else { mat <- unique(as.character( tab$geneid[!is.na(tab$geneid[tab$geneid != ""])])) geneid <- mat limit <- 1 } #####mapping geneid (with or without expression values) on KEGG pathway plot_col_key <- TRUE low_color <- "green" mid_color <- "#F3F781" #yellow high_color <- "red" if (!fold_change_data) { plot_col_key <- FALSE #if there's no exrepession data, we don't show the color key high_color <- "#81BEF7" #blue } #create graph(s) and text output for (id in ids) { suffix <- get_suffix(pathways_list, species, id) pv_out <- suppressMessages(pathview(gene.data = mat, gene.idtype = "entrez", pathway.id = id, species = species, kegg.dir = ".", out.suffix = suffix, kegg.native = native_kegg, low = list(gene = low_color, cpd = "blue"), mid = list(gene = mid_color, cpd = "transparent"), high = list(gene = high_color, cpd = "yellow"), na.col = "#D8D8D8", #gray cpd.data = NULL, plot_col_key = plot_col_key, pdf.size = c(9, 9), limit = list(gene = limit, cpd = limit))) if (is.list(pv_out)) { #creating text file if (!exists("df")) { df <- data.frame(t(mapping_summary(pv_out, species, id, id_type, pathways_list, geneid, uniprotid, mapped2geneid)), stringsAsFactors = F, check.names = F) } else { df <- rbind(df, data.frame(t(mapping_summary(pv_out, species, id, id_type, pathways_list, geneid, uniprotid, mapped2geneid)), stringsAsFactors = F, check.names = F)) } } } df <- as.data.frame(apply(df, c(1, 2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA #text file output write.table(df, file = args$output, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) } main()
