Mercurial > repos > proteore > proteore_pathview_mapping

diff PathView.R @ 0:097bb3ed2b4d draft
planemo upload commit 2e441b4969ae7cf9aeb227a1d47c43ef7268a5e6-dirty
author: proteore
date: Wed, 22 Aug 2018 11:30:23 -0400
children: b617d4bbebf8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PathView.R	Wed Aug 22 11:30:23 2018 -0400
@@ -0,0 +1,234 @@
+#!/usr/bin/Rscript
+#Rscript made for mapping genesID on KEGG pathway with Pathview package
+#input : csv file containing ids (uniprot or geneID) to map, plus parameters
+#output : KEGG pathway : jpeg or pdf file.
+
+suppressMessages(library("pathview"))
+
+read_file <- function(path,header){
+    file <- try(read.table(path,header=header, sep="\t",stringsAsFactors = FALSE, quote=""),silent=TRUE)
+    if (inherits(file,"try-error")){
+      stop("File not found !")
+    }else{
+      return(file)
+    }
+}
+
+##### fuction to clean and concatenate pathway name (allow more flexibility for user input) 
+concat_string <- function(x){
+  x <- gsub(" - .*","",x)
+  x <- gsub(" ","",x)
+  x <- gsub("-","",x)
+  x <- gsub("_","",x)
+  x <- gsub(",","",x)
+  x <- gsub("\\'","",x)
+  x <- gsub("\\(.*)","",x)
+  x <- gsub("\\/","",x)
+  x <- tolower(x)
+  return(x)
+}
+
+
+get_args <- function(){
+  
+  ## Collect arguments
+  args <- commandArgs(TRUE)
+  
+  ## Default setting when no arguments passed
+  if(length(args) < 1) {
+    args <- c("--help")
+  }
+  
+  ## Help section
+  if("--help" %in% args) {
+    cat("Pathview R script
+    Arguments:
+      --help                  Print this test
+      --input                 path of the input  file (must contains a colum of uniprot and/or geneID accession number)
+      --id_list               list of ids to use, ',' separated
+      --pathways_id            Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 
+      --id_type               Type of accession number ('uniprotID' or 'geneID')
+      --id_column             Column containing accesion number of interest (ex : 'c1')
+      --header                Boolean, TRUE if header FALSE if not
+      --ouput                 Output filename
+      --expression_values1    Column containing expression values (first condition)
+      --expression_values2    Column containing expression values (second condition)
+      --expression_values3    Column containing expression values (third condition)
+      --native_kegg           TRUE : native KEGG graph, FALSE : Graphviz graph
+      --species               KEGG species (hsa, mmu, ...)
+
+      Example:
+      ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n")
+    
+    q(save="no")
+  }
+  
+  
+  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/pathview/args.Rda")
+  #load("/home/dchristiany/proteore_project/ProteoRE/tools/pathview/args.Rda")
+  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
+  args <- as.list(as.character(argsDF$V2))
+  names(args) <- argsDF$V1
+  
+  return(args)
+}
+
+str2bool <- function(x){
+  if (any(is.element(c("t","true"),tolower(x)))){
+    return (TRUE)
+  }else if (any(is.element(c("f","false"),tolower(x)))){
+    return (FALSE)
+  }else{
+    return(NULL)
+  }
+}
+
+is.letter <- function(x) grepl("[[:alpha:]]", x)
+
+#### hsa00010 -> 00010
+remove_kegg_prefix <- function(x){
+  if (is.letter(substr(x,1,3))){
+    x <- substr(x,4,nchar(x))
+  }
+  return(x)
+}
+
+clean_bad_character <- function(string)  {
+  string <- gsub("X","",string)
+  string <- gsub(" ","",string)
+  return(string)
+}
+
+args <- get_args()
+
+###save and load args in rda file for testing
+#save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/pathview/args.Rda")
+#load("/home/dchristiany/proteore_project/ProteoRE/tools/pathview/args.Rda")
+
+###setting variables
+if (!is.null(args$pathways_id)) { ids <- sapply(rapply(strsplit(clean_bad_character(args$pathways_id),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE)}
+#if (!is.null(args$pathways_name)) {names <- as.vector(sapply(strsplit(args$pathways_name,","), function(x) concat_string(x),USE.NAMES = FALSE))}
+if (!is.null(args$id_list)) {id_list <- as.vector(strsplit(clean_bad_character(args$id_list),","))}
+id_type <- tolower(args$id_type)
+ncol <- as.numeric(gsub("c", "" ,args$id_column))
+header <- str2bool(args$header)
+#output <- args$output
+native_kegg <- str2bool(args$native_kegg)
+species=args$species
+
+
+#read input file or list
+if (!is.null(args$input)){
+  tab <- read_file(args$input,header)
+  tab <- tab[!apply(is.na(tab) | tab == "", 1, all),] #delete empty rows
+} else {
+  tab <- data.frame(id_list)
+  ncol=1
+}
+
+e1 <- as.numeric(gsub("c", "" ,args$expression_values1))
+if (!is.null(args$expression_values1)) { colnames(tab)[e1] <- "e1" }
+e2 <- as.numeric(gsub("c", "" ,args$expression_values2))
+if (!is.null(args$expression_values2)) { colnames(tab)[e2] <- "e2" }
+e3 <- as.numeric(gsub("c", "" ,args$expression_values3))
+if (!is.null(args$expression_values3)) { colnames(tab)[e3] <- "e3" }
+
+
+##### map uniprotID to entrez geneID
+if (id_type == "uniprotid") {
+  
+  uniprotID = tab[,ncol]
+  mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = "Hs", pkg.name = NULL)
+  geneID = mapped2geneID[,2]
+  tab = cbind(tab,geneID)
+
+}else if (id_type == "geneid"){
+
+  colnames(tab)[ncol] <- "geneID"
+
+}
+
+geneID = tab$geneID[which(tab$geneID !="NA")]
+geneID = gsub(" ","",geneID)
+geneID = unlist(strsplit(geneID,"[;]"))
+
+
+#### get hsa pathways list 
+#download.file(url = "http://rest.kegg.jp/link/pathway/hsa", destfile = "/home/dchristiany/proteore_project/ProteoRE/tools/pathview/geneID_to_hsa_pathways.csv") 
+#geneid_hsa_pathways <- read_file(path = "/home/dchristiany/proteore_project/ProteoRE/tools/pathview/geneID_to_hsa_pathways.csv",FALSE)
+#names(geneid_hsa_pathways) <- c("geneID","pathway")
+
+##### build matrix to map on KEGG pathway (kgml : KEGG xml)
+if (!is.null(args$expression_values1)&is.null(args$expression_values2)&is.null(args$expression_values3)){
+  mat <- as.data.frame(cbind(tab$e1)[which(!is.na(tab$geneID)),])
+  row.names(mat) <- tab$geneID[which(!is.na(tab$geneID))]
+} else if (!is.null(args$expression_values1)&!is.null(args$expression_values2)&is.null(args$expression_values3)){
+  mat <- as.data.frame(cbind(tab$e1,tab$e2)[which(!is.na(tab$geneID)),])
+  row.names(mat) <- tab$geneID[which(!is.na(tab$geneID))]
+}else if (!is.null(args$expression_values1)&!is.null(args$expression_values2)&!is.null(args$expression_values3)){
+  mat <- as.data.frame(cbind(tab$e1,tab$e2,tab$e3)[which(!is.na(tab$geneID)),])
+  row.names(mat) <- tab$geneID[which(!is.na(tab$geneID))]
+} else {
+  mat <- geneID
+}
+
+
+#### simulation data test
+#exp1 <- sim.mol.data(mol.type = c("gene", "gene.ko", "cpd")[1], id.type = NULL, species="hsa", discrete = FALSE, nmol = 161, nexp = 1, rand.seed=100)
+#exp2 <- sim.mol.data(mol.type = c("gene", "gene.ko", "cpd")[1], id.type = NULL, species="hsa", discrete = FALSE, nmol = 161, nexp = 1, rand.seed=50)
+#exp3 <- sim.mol.data(mol.type = c("gene", "gene.ko", "cpd")[1], id.type = NULL, species="hsa", discrete = FALSE, nmol = 161, nexp = 1, rand.seed=10)
+#tab <- cbind(tab,exp1,exp2,exp3)
+
+#write.table(tab, file='/home/dchristiany/proteore_project/ProteoRE/tools/pathview/Lacombe_sim_expression_data.tsv', quote=FALSE, sep='\t',row.names = FALSE)
+
+#mat <- exp1[1:nrow(tab)]
+#names(mat) <- geneID
+
+
+#####mapping geneID (with or without expression values) on KEGG pathway
+for (id in ids) {
+  pathview(gene.data = mat,
+           #gene.idtype = "geneID",
+           #cpd.data = uniprotID,
+           #cpd.idtype = "uniprot",
+           pathway.id = id,
+           #pathway.name = "",
+           species = species, 
+           kegg.dir = ".", 
+           gene.idtype = "entrez", 
+           #gene.annotpkg = NULL, 
+           #min.nnodes = 3, 
+           kegg.native = native_kegg,
+           #map.null = TRUE, 
+           #expand.node = FALSE, 
+           #split.group = FALSE, 
+           #map.symbol = TRUE, 
+           #map.cpdname = TRUE, 
+           #node.sum = "sum", 
+           #discrete=list(gene=FALSE,cpd=FALSE), 
+           #limit = list(gene = 1, cpd = 1), 
+           #bins = list(gene = 10, cpd = 10), 
+           #both.dirs = list(gene = T, cpd = T), 
+           #trans.fun = list(gene = NULL, cpd = NULL), 
+           #low = list(gene = "green", cpd = "blue"), 
+           #mid = list(gene = "gray", cpd = "gray"), 
+           #high = list(gene = "red", cpd = "yellow"), 
+           #na.col = "transparent",
+           #sign.pos="bottomleft",
+           #key.pos="topright",
+           #new.signature=TRUE,
+           #rankdir="LB",
+           #cex=0.3,
+           #text.width=15,
+           #res=300,
+           pdf.size=c(9,9))
+           #is.signal=TRUE)
+}
+
+########using keggview.native
+
+#xml.file=system.file("extdata", "hsa00010.xml", package = "pathview")
+#node.data=node.info("/home/dchristiany/hsa00010.xml")
+#plot.data.gene=node.map(mol.data=test, node.data, node.types="gene")
+#colors =node.color(plot.data = plot.data.gene[,1:9])
\ No newline at end of file
author	proteore
date	Wed, 22 Aug 2018 11:30:23 -0400
parents
children	b617d4bbebf8