Mercurial > repos > mvdbeek > r_goseq_1_22_0
diff getgo.r @ 9:04b9c519d3e1 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 6574ffcb63770ff8de2d496894292cb7ce0492a8
author | mvdbeek |
---|---|
date | Thu, 31 Mar 2016 12:23:45 -0400 |
parents | 0e9424413ab0 |
children |
line wrap: on
line diff
--- a/getgo.r Mon Mar 07 14:35:53 2016 -0500 +++ b/getgo.r Thu Mar 31 12:23:45 2016 -0400 @@ -6,15 +6,13 @@ suppressPackageStartupMessages({ library("goseq") library("optparse") - library("rtracklayer") library("reshape2") }) sink(stdout(), type = "message") option_list <- list( - make_option(c("-gtf", "--gtf"), type="character", help = "Path to GTF file for which to fetch GO data"), - make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up GO categories]"), + make_option(c("-p", "--package"), type="character", help = "Genome [used for looking up GO categories]"), make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"), make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"), make_option(c("-o", "--output"), type="character", help="Path to output file") @@ -25,17 +23,48 @@ # vars -gtf = args$gtf -genome = args$genome +package = args$package gene_id = args$gene_id output = args$output cats = unlist(strsplit(args$cats, ',')) -# retrieve and transform data -genes = unique(import.gff(gtf)$gene_id) -go_categories = getgo(genes, genome, gene_id, fetch.cats=cats) -go_categories = goseq:::reversemapping(go_categories) -go_categories = melt(go_categories) +get_categories = function(package_str, gen, cat) { + # gen should be ENSEMBL, UNIGENE, REFSEQ, SYMBOL or GENENAME + # package should be org.Xx.eg.db + # cat should be PMID, GO2ALLEGS, ENZYME or PATH + library(package_str, character.only = TRUE) + package = eval( parse( text=package_str ) ) + if( cat %in% c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS") ) { + cat = "GOALL" + } + if(package_str == "org.Pf.plasmo.db") { + keytype = "ORF" + } else if(package_str == "org.At.tair.db") { + keytype = "TAIR" + } else { + keytype = "ENTREZID" + } + entrez_cat = select(package, keys(package), cat, keytype) + entrez_cat = entrez_cat[complete.cases(entrez_cat),] + if( cat != "GOALL" ) { + # add the origin of the term, so that there are no duplicate values e.g between ENZYME and PATH + entrez_cat[,2] = sapply(entrez_cat[,2], function(x) paste(cat, x, sep=":")) + } else { + entrez_cat = entrez_cat[,c(1,2)] # we are discarding ontology (MF, CC, BP) and evidence class here + } + colnames(entrez_cat) = c(gen, "category") + if( gen == "ENTREZ" ) { + return( entrez_cat ) + } else { + # We map ENTREZ to `gen`, but are potentially loosing gene identifiers where multiple identifiers match a single ENTREZ gene id. + entrez_cat[,1] = mapIds(package, keys=as.character(entrez_cat[,1]), keytype=keytype, column=gen, multiVals="first") + entrez_cat = entrez_cat[complete.cases(entrez_cat),] + return(entrez_cat) + } +} -write.table(go_categories, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) -sessionInfo() \ No newline at end of file +result = lapply( cats, function(x) get_categories(package, gene_id, x ) ) +result = do.call(rbind, result) + +write.table(result, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) +sessionInfo()