comparison getgo.r @ 9:04b9c519d3e1 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 6574ffcb63770ff8de2d496894292cb7ce0492a8
author mvdbeek
date Thu, 31 Mar 2016 12:23:45 -0400
parents 0e9424413ab0
children
comparison
equal deleted inserted replaced
8:fb95db039592 9:04b9c519d3e1
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5 5
6 suppressPackageStartupMessages({ 6 suppressPackageStartupMessages({
7 library("goseq") 7 library("goseq")
8 library("optparse") 8 library("optparse")
9 library("rtracklayer")
10 library("reshape2") 9 library("reshape2")
11 }) 10 })
12 11
13 sink(stdout(), type = "message") 12 sink(stdout(), type = "message")
14 13
15 option_list <- list( 14 option_list <- list(
16 make_option(c("-gtf", "--gtf"), type="character", help = "Path to GTF file for which to fetch GO data"), 15 make_option(c("-p", "--package"), type="character", help = "Genome [used for looking up GO categories]"),
17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up GO categories]"),
18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"), 16 make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"),
19 make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"), 17 make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"),
20 make_option(c("-o", "--output"), type="character", help="Path to output file") 18 make_option(c("-o", "--output"), type="character", help="Path to output file")
21 ) 19 )
22 20
23 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 21 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
24 args = parse_args(parser) 22 args = parse_args(parser)
25 23
26 # vars 24 # vars
27 25
28 gtf = args$gtf 26 package = args$package
29 genome = args$genome
30 gene_id = args$gene_id 27 gene_id = args$gene_id
31 output = args$output 28 output = args$output
32 cats = unlist(strsplit(args$cats, ',')) 29 cats = unlist(strsplit(args$cats, ','))
33 30
34 # retrieve and transform data 31 get_categories = function(package_str, gen, cat) {
35 genes = unique(import.gff(gtf)$gene_id) 32 # gen should be ENSEMBL, UNIGENE, REFSEQ, SYMBOL or GENENAME
36 go_categories = getgo(genes, genome, gene_id, fetch.cats=cats) 33 # package should be org.Xx.eg.db
37 go_categories = goseq:::reversemapping(go_categories) 34 # cat should be PMID, GO2ALLEGS, ENZYME or PATH
38 go_categories = melt(go_categories) 35 library(package_str, character.only = TRUE)
36 package = eval( parse( text=package_str ) )
37 if( cat %in% c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS") ) {
38 cat = "GOALL"
39 }
40 if(package_str == "org.Pf.plasmo.db") {
41 keytype = "ORF"
42 } else if(package_str == "org.At.tair.db") {
43 keytype = "TAIR"
44 } else {
45 keytype = "ENTREZID"
46 }
47 entrez_cat = select(package, keys(package), cat, keytype)
48 entrez_cat = entrez_cat[complete.cases(entrez_cat),]
49 if( cat != "GOALL" ) {
50 # add the origin of the term, so that there are no duplicate values e.g between ENZYME and PATH
51 entrez_cat[,2] = sapply(entrez_cat[,2], function(x) paste(cat, x, sep=":"))
52 } else {
53 entrez_cat = entrez_cat[,c(1,2)] # we are discarding ontology (MF, CC, BP) and evidence class here
54 }
55 colnames(entrez_cat) = c(gen, "category")
56 if( gen == "ENTREZ" ) {
57 return( entrez_cat )
58 } else {
59 # We map ENTREZ to `gen`, but are potentially loosing gene identifiers where multiple identifiers match a single ENTREZ gene id.
60 entrez_cat[,1] = mapIds(package, keys=as.character(entrez_cat[,1]), keytype=keytype, column=gen, multiVals="first")
61 entrez_cat = entrez_cat[complete.cases(entrez_cat),]
62 return(entrez_cat)
63 }
64 }
39 65
40 write.table(go_categories, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) 66 result = lapply( cats, function(x) get_categories(package, gene_id, x ) )
67 result = do.call(rbind, result)
68
69 write.table(result, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
41 sessionInfo() 70 sessionInfo()