diff getgo.r @ 9:04b9c519d3e1 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 6574ffcb63770ff8de2d496894292cb7ce0492a8
author mvdbeek
date Thu, 31 Mar 2016 12:23:45 -0400
parents 0e9424413ab0
children
line wrap: on
line diff
--- a/getgo.r	Mon Mar 07 14:35:53 2016 -0500
+++ b/getgo.r	Thu Mar 31 12:23:45 2016 -0400
@@ -6,15 +6,13 @@
 suppressPackageStartupMessages({
     library("goseq")
     library("optparse")
-    library("rtracklayer")
     library("reshape2")
 })
 
 sink(stdout(), type = "message")
 
 option_list <- list(
-    make_option(c("-gtf", "--gtf"), type="character", help = "Path to GTF file for which to fetch GO data"),
-    make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up GO categories]"),
+    make_option(c("-p", "--package"), type="character", help = "Genome [used for looking up GO categories]"),
     make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"),
     make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"),
     make_option(c("-o", "--output"), type="character", help="Path to output file")
@@ -25,17 +23,48 @@
 
 # vars
 
-gtf = args$gtf
-genome = args$genome
+package = args$package
 gene_id = args$gene_id
 output = args$output
 cats = unlist(strsplit(args$cats, ','))
 
-# retrieve and transform data
-genes = unique(import.gff(gtf)$gene_id)
-go_categories = getgo(genes, genome, gene_id, fetch.cats=cats)
-go_categories = goseq:::reversemapping(go_categories)
-go_categories = melt(go_categories)
+get_categories = function(package_str, gen, cat) {
+  # gen should be ENSEMBL, UNIGENE, REFSEQ, SYMBOL or GENENAME
+  # package should be org.Xx.eg.db
+  # cat should be PMID, GO2ALLEGS, ENZYME or PATH
+  library(package_str, character.only = TRUE)
+  package = eval( parse( text=package_str ) )
+  if( cat %in% c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS") ) {
+    cat = "GOALL"
+  }
+  if(package_str == "org.Pf.plasmo.db") {
+    keytype = "ORF"
+    } else if(package_str == "org.At.tair.db") {
+    keytype = "TAIR"
+    } else {
+    keytype = "ENTREZID"
+    }
+  entrez_cat = select(package, keys(package), cat, keytype)
+  entrez_cat = entrez_cat[complete.cases(entrez_cat),]
+    if( cat != "GOALL" ) {
+      # add the origin of the term, so that there are no duplicate values e.g between ENZYME and PATH
+      entrez_cat[,2] = sapply(entrez_cat[,2], function(x) paste(cat, x, sep=":"))
+    } else {
+      entrez_cat = entrez_cat[,c(1,2)] # we are discarding ontology (MF, CC, BP) and evidence class here
+    }
+  colnames(entrez_cat) = c(gen, "category")
+  if( gen == "ENTREZ" ) {
+    return( entrez_cat )
+    } else {
+      # We map ENTREZ to `gen`, but are potentially loosing gene identifiers where multiple identifiers match a single ENTREZ gene id.
+      entrez_cat[,1] = mapIds(package, keys=as.character(entrez_cat[,1]), keytype=keytype, column=gen, multiVals="first")
+      entrez_cat = entrez_cat[complete.cases(entrez_cat),]
+      return(entrez_cat)
+    }
+}
 
-write.table(go_categories, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
-sessionInfo()
\ No newline at end of file
+result = lapply( cats, function(x) get_categories(package, gene_id, x ) )
+result = do.call(rbind, result)
+
+write.table(result, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
+sessionInfo()