Mercurial > repos > mvdbeek > r_goseq_1_22_0
view getgo.r @ 10:f7f3f7db2d4a draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 6574ffcb63770ff8de2d496894292cb7ce0492a8-dirty
author | mvdbeek |
---|---|
date | Thu, 31 Mar 2016 12:30:01 -0400 |
parents | 04b9c519d3e1 |
children |
line wrap: on
line source
options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") suppressPackageStartupMessages({ library("goseq") library("optparse") library("reshape2") }) sink(stdout(), type = "message") option_list <- list( make_option(c("-p", "--package"), type="character", help = "Genome [used for looking up GO categories]"), make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"), make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"), make_option(c("-o", "--output"), type="character", help="Path to output file") ) parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) args = parse_args(parser) # vars package = args$package gene_id = args$gene_id output = args$output cats = unlist(strsplit(args$cats, ',')) get_categories = function(package_str, gen, cat) { # gen should be ENSEMBL, UNIGENE, REFSEQ, SYMBOL or GENENAME # package should be org.Xx.eg.db # cat should be PMID, GO2ALLEGS, ENZYME or PATH library(package_str, character.only = TRUE) package = eval( parse( text=package_str ) ) if( cat %in% c("GO2ALLEGS", "GO2ALLTAIRS", "GO2ALLORFS") ) { cat = "GOALL" } if(package_str == "org.Pf.plasmo.db") { keytype = "ORF" } else if(package_str == "org.At.tair.db") { keytype = "TAIR" } else { keytype = "ENTREZID" } entrez_cat = select(package, keys(package), cat, keytype) entrez_cat = entrez_cat[complete.cases(entrez_cat),] if( cat != "GOALL" ) { # add the origin of the term, so that there are no duplicate values e.g between ENZYME and PATH entrez_cat[,2] = sapply(entrez_cat[,2], function(x) paste(cat, x, sep=":")) } else { entrez_cat = entrez_cat[,c(1,2)] # we are discarding ontology (MF, CC, BP) and evidence class here } colnames(entrez_cat) = c(gen, "category") if( gen == "ENTREZ" ) { return( entrez_cat ) } else { # We map ENTREZ to `gen`, but are potentially loosing gene identifiers where multiple identifiers match a single ENTREZ gene id. entrez_cat[,1] = mapIds(package, keys=as.character(entrez_cat[,1]), keytype=keytype, column=gen, multiVals="first") entrez_cat = entrez_cat[complete.cases(entrez_cat),] return(entrez_cat) } } result = lapply( cats, function(x) get_categories(package, gene_id, x ) ) result = do.call(rbind, result) write.table(result, output, sep="\t", col.names = FALSE, row.names = FALSE, quote = FALSE) sessionInfo()