Mercurial > repos > mvdbeek > r_goseq_1_22_0
comparison getgo.r @ 9:04b9c519d3e1 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 6574ffcb63770ff8de2d496894292cb7ce0492a8
author | mvdbeek |
---|---|
date | Thu, 31 Mar 2016 12:23:45 -0400 |
parents | 0e9424413ab0 |
children |
comparison
equal
deleted
inserted
replaced
8:fb95db039592 | 9:04b9c519d3e1 |
---|---|
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
5 | 5 |
6 suppressPackageStartupMessages({ | 6 suppressPackageStartupMessages({ |
7 library("goseq") | 7 library("goseq") |
8 library("optparse") | 8 library("optparse") |
9 library("rtracklayer") | |
10 library("reshape2") | 9 library("reshape2") |
11 }) | 10 }) |
12 | 11 |
13 sink(stdout(), type = "message") | 12 sink(stdout(), type = "message") |
14 | 13 |
15 option_list <- list( | 14 option_list <- list( |
16 make_option(c("-gtf", "--gtf"), type="character", help = "Path to GTF file for which to fetch GO data"), | 15 make_option(c("-p", "--package"), type="character", help = "Genome [used for looking up GO categories]"), |
17 make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up GO categories]"), | |
18 make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"), | 16 make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"), |
19 make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"), | 17 make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"), |
20 make_option(c("-o", "--output"), type="character", help="Path to output file") | 18 make_option(c("-o", "--output"), type="character", help="Path to output file") |
21 ) | 19 ) |
22 | 20 |
23 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | 21 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) |
24 args = parse_args(parser) | 22 args = parse_args(parser) |
25 | 23 |
26 # vars | 24 # vars |
27 | 25 |
28 gtf = args$gtf | 26 package = args$package |
29 genome = args$genome | |
30 gene_id = args$gene_id | 27 gene_id = args$gene_id |
31 output = args$output | 28 output = args$output |
32 cats = unlist(strsplit(args$cats, ',')) | 29 cats = unlist(strsplit(args$cats, ',')) |
33 | 30 |
34 # retrieve and transform data | 31 get_categories = function(package_str, gen, cat) { |
35 genes = unique(import.gff(gtf)$gene_id) | 32 # gen should be ENSEMBL, UNIGENE, REFSEQ, SYMBOL or GENENAME |
36 go_categories = getgo(genes, genome, gene_id, fetch.cats=cats) | 33 # package should be org.Xx.eg.db |
37 go_categories = goseq:::reversemapping(go_categories) | 34 # cat should be PMID, GO2ALLEGS, ENZYME or PATH |
38 go_categories = melt(go_categories) | 35 library(package_str, character.only = TRUE) |
36 package = eval( parse( text=package_str ) ) | |
37 if( cat %in% c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS") ) { | |
38 cat = "GOALL" | |
39 } | |
40 if(package_str == "org.Pf.plasmo.db") { | |
41 keytype = "ORF" | |
42 } else if(package_str == "org.At.tair.db") { | |
43 keytype = "TAIR" | |
44 } else { | |
45 keytype = "ENTREZID" | |
46 } | |
47 entrez_cat = select(package, keys(package), cat, keytype) | |
48 entrez_cat = entrez_cat[complete.cases(entrez_cat),] | |
49 if( cat != "GOALL" ) { | |
50 # add the origin of the term, so that there are no duplicate values e.g between ENZYME and PATH | |
51 entrez_cat[,2] = sapply(entrez_cat[,2], function(x) paste(cat, x, sep=":")) | |
52 } else { | |
53 entrez_cat = entrez_cat[,c(1,2)] # we are discarding ontology (MF, CC, BP) and evidence class here | |
54 } | |
55 colnames(entrez_cat) = c(gen, "category") | |
56 if( gen == "ENTREZ" ) { | |
57 return( entrez_cat ) | |
58 } else { | |
59 # We map ENTREZ to `gen`, but are potentially loosing gene identifiers where multiple identifiers match a single ENTREZ gene id. | |
60 entrez_cat[,1] = mapIds(package, keys=as.character(entrez_cat[,1]), keytype=keytype, column=gen, multiVals="first") | |
61 entrez_cat = entrez_cat[complete.cases(entrez_cat),] | |
62 return(entrez_cat) | |
63 } | |
64 } | |
39 | 65 |
40 write.table(go_categories, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) | 66 result = lapply( cats, function(x) get_categories(package, gene_id, x ) ) |
67 result = do.call(rbind, result) | |
68 | |
69 write.table(result, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) | |
41 sessionInfo() | 70 sessionInfo() |