diff GO-enrich.R @ 0:076349b72690 draft

planemo upload commit 2e441b4969ae7cf9aeb227a1d47c43ef7268a5e6-dirty
author proteore
date Wed, 22 Aug 2018 10:44:46 -0400
parents
children 91b9b48d07b3
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GO-enrich.R	Wed Aug 22 10:44:46 2018 -0400
@@ -0,0 +1,226 @@
+suppressMessages(library(clusterProfiler))
+
+#library(org.Sc.sgd.db)
+suppressMessages(library(org.Hs.eg.db))
+suppressMessages(library(org.Mm.eg.db))
+
+# Read file and return file content as data.frame
+readfile = function(filename, header) {
+  if (header == "true") {
+    # Read only first line of the file as header:
+    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    #Read the data of the files (skipping the first row)
+    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    # Remove empty rows
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+    #And assign the header to the data
+    names(file) <- headers
+  }
+  else {
+    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+    # Remove empty rows
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+  }
+  return(file)
+}
+
+repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) {
+  ggo<-groupGO(gene=geneid, 
+               OrgDb = orgdb, 
+               ont=ontology, 
+               level=level, 
+               readable=TRUE)
+  name <- paste("GGO.", ontology, ".png", sep = "")
+  png(name)
+  p <- barplot(ggo, showCategory=10)
+  print(p)
+  dev.off()
+  return(ggo)
+}
+
+# GO over-representation test
+enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff) {
+  ego<-enrichGO(gene=geneid,
+                universe=universe,
+                OrgDb=orgdb,
+                keytype="ENTREZID",
+                ont=ontology,
+                pAdjustMethod="BH",
+                pvalueCutoff=pval_cutoff,
+                qvalueCutoff=qval_cutoff,
+                readable=TRUE)
+  # Plot bar & dot plots
+  bar_name <- paste("EGO.", ontology, ".bar.png", sep = "")
+  png(bar_name)
+  p <- barplot(ego)
+  print(p)
+  dev.off()
+  dot_name <- paste("EGO.", ontology, ".dot.png", sep = "")
+  png(dot_name)
+  p <- dotplot(ego, showCategory=10)
+  print(p)
+  dev.off()
+  return(ego)
+}
+
+check_ids <- function(vector,type) {
+  uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
+  entrez_id = "^'[0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
+  if (type == "entrez")
+    return(grepl(entrez_id,vector))
+  else if (type == "uniprot") {
+    return(grepl(uniprot_pattern,vector))
+  }
+}
+
+clusterProfiler = function() {
+  args <- commandArgs(TRUE)
+  if(length(args)<1) {
+    args <- c("--help")
+  }
+  
+  # Help section
+  if("--help" %in% args) {
+    cat("clusterProfiler Enrichment Analysis
+    Arguments:
+        --input_type: type of input (list of id or filename)
+        --input: input
+        --ncol: the column number which contains list of input IDs
+        --header: true/false if your file contains a header
+        --id_type: the type of input IDs (UniProt/EntrezID)
+        --universe_type: list or filename
+        --universe: background IDs list
+        --uncol: the column number which contains background IDs list
+        --uheader: true/false if the background IDs file contains header
+        --universe_id_type: the type of universe IDs (UniProt/EntrezID)
+        --species
+        --onto_opt: ontology options
+        --go_function: groupGO/enrichGO
+        --level: 1-3
+        --pval_cutoff
+        --qval_cutoff
+        --text_output: text output filename \n")
+    q(save="no")
+  }
+  # Parse arguments
+  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
+  args <- as.list(as.character(argsDF$V2))
+  names(args) <- argsDF$V1
+  #print(args)
+  
+  #save(args,file="args.Rda")
+  load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
+  
+  # Extract OrgDb
+  if (args$species=="human") {
+    orgdb<-org.Hs.eg.db
+  } else if (args$species=="mouse") {
+    orgdb<-org.Mm.eg.db
+  } else if (args$species=="rat") {
+    orgdb<-org.Rn.eg.db
+  }
+
+  # Extract input IDs
+  input_type = args$input_type
+  if (input_type == "text") {
+    input = strsplit(args$input, "[ \t\n]+")[[1]]
+  } else if (input_type == "file") {
+    filename = args$input
+    ncol = args$ncol
+    # Check ncol
+    if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
+      stop("Please enter the right format for column number: c[number]")
+    } else {
+      ncol = as.numeric(gsub("c", "", ncol))
+    }
+    header = args$header
+    # Get file content
+    file = readfile(filename, header)
+    # Extract Protein IDs list
+    input =  sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)
+  }
+  id_type = args$id_type
+  ## Get input gene list from input IDs
+  #ID format Conversion 
+  #This case : from UNIPROT (protein id) to ENTREZ (gene id)
+  #bitr = conversion function from clusterProfiler
+  if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) {
+    any(check_ids(input,"uniprot"))
+    idFrom<-"UNIPROT"
+    idTo<-"ENTREZID"
+    gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb)
+    gene<-unique(gene$ENTREZID)
+  } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) {
+    gene<-unique(input)
+  } else {
+    print(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file"))
+    stop()
+  }
+
+  ontology <- strsplit(args$onto_opt, ",")[[1]]
+  ## Extract GGO/EGO arguments
+  if (args$go_represent == "true") {
+    go_represent <- args$go_represent
+    level <- as.numeric(args$level)
+  }
+  if (args$go_enrich == "true") {
+    go_enrich <- args$go_enrich
+    pval_cutoff <- as.numeric(args$pval_cutoff)
+    qval_cutoff <- as.numeric(args$qval_cutoff)
+    # Extract universe background genes (same as input file)
+    if (!is.null(args$universe_type)) {
+      universe_type = args$universe_type
+      if (universe_type == "text") {
+        universe = strsplit(args$universe, "[ \t\n]+")[[1]]
+      } else if (universe_type == "file") {
+        universe_filename = args$universe
+        universe_ncol = args$uncol
+        # Check ncol
+        if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
+          stop("Please enter the right format for column number: c[number]")
+        } else {
+          universe_ncol = as.numeric(gsub("c", "", universe_ncol))
+        }
+        universe_header = args$uheader
+        # Get file content
+        universe_file = readfile(universe_filename, universe_header)
+        # Extract Protein IDs list
+        universe <- sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)
+      }
+      universe_id_type = args$universe_id_type
+      ##to initialize
+      if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) {
+        idFrom<-"UNIPROT"
+        idTo<-"ENTREZID"
+        universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb)
+        universe_gene<-unique(universe_gene$ENTREZID)
+      } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) {
+        universe_gene<-unique(universe)
+      } else {
+        if (universe_type=="text"){
+          print(paste(universe_id_type,"not found in your background IDs list",sep=" "))
+        } else {
+          print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" "))
+        }
+        universe_gene = NULL
+      } 
+    } else {
+      universe_gene = NULL
+    }
+  }
+
+  ##enrichGO : GO over-representation test
+  for (onto in ontology) {
+    if (args$go_represent == "true") {
+      ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE)
+      write.table(ggo, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+    }
+    if (args$go_enrich == "true" & !is.null(universe_gene)) {
+      ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff)
+      write.table(ego, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+    }
+  }
+}
+
+clusterProfiler()