changeset 5:b79c65c90744 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit 81aedf1b50849160f6c048c0da4bb1038bb813a5
author mvdbeek
date Sun, 28 Feb 2016 11:52:10 -0500
parents 76eab486aba9
children 0e9424413ab0
files getgo.r getgo.xml goseq.r goseq.xml tool_dependencies.xml
diffstat 5 files changed, 93 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getgo.r	Sun Feb 28 11:52:10 2016 -0500
@@ -0,0 +1,35 @@
+suppressWarnings(suppressMessages(library(goseq)))
+suppressWarnings(suppressMessages(library(optparse)))
+suppressWarnings(suppressMessages(library(rtracklayer)))
+suppressWarnings(suppressMessages(library(reshape2)))
+sink(stdout(), type = "message")
+
+option_list <- list(
+    make_option(c("-gtf", "--gtf"), type="character", help = "Path to GTF file for which to fetch GO data"),
+    make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up GO categories]"),
+    make_option(c("-i", "--gene_id"), type="character", help="Gene ID format"),
+    make_option(c("-c", "--cats"), type="character", help="Comma-seperated list of categories to fetch"),
+    make_option(c("-o", "--output"), type="character", help="Path to output file")
+)
+
+parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
+args = parse_args(parser)
+
+# Vars:
+
+gtf = args$gtf
+genome = args$genome
+gene_id = args$gene_id
+output = args$output
+cats = unlist(strsplit(args$cats, ','))
+genes = unique(import.gff(gtf)$gene_id)
+go_categories = getgo(genes, genome, id, fetch.cats=cats)
+
+# transform go category list to sth. more manipulatable in galaxy
+go_categories <- lapply(go_categories, unlist)
+go_categories = goseq:::reversemapping(go_categories)
+go_categories = melt(go_categories)
+colnames(go_categories) = c("#gene_id", "go_category")
+
+write.table(go_categories, output, sep="\t", row.names = FALSE, quote = FALSE)
+sessionInfo()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getgo.xml	Sun Feb 28 11:52:10 2016 -0500
@@ -0,0 +1,43 @@
+<tool id="getgo" name="Retrieve GO ontologies" version="0.1.0">
+    <description />
+    <requirements>
+        <requirement type="package" version="3.2.1">R</requirement>
+        <requirement type="package" version="1.22.0">goseq</requirement>
+    </requirements>
+    <command interpreter="Rscript">
+        getgo.r --genome "$genome"
+        --gtf "$gtf"
+        --gene_id "$gene_id"
+        --output "$output"
+        --cats "$cats"
+    </command>
+    <inputs>
+        <param name="gtf" label="select GTF file" help="GO annotations for all gene ids in this GTF will be fetched" type="data" format="gtf"/>
+        <param help="Needed to retrieve GO annotations for the selected genome" label="Select the genome source" name="genome" size="3" type="select">
+            <options from_data_table="go_genomes"></options>
+        </param>
+        <param help="Needed for GO analysis" label="Select gene identifier format" name="gene_id" type="select">
+            <options from_data_table="go_gene_ids"></options>
+        </param>
+        <param name="cats" help="Select the categories for which you would like to retrieve ontologies" type="select" multiple="true" display="checkboxes">
+            <option value="GO:CC">GO:Cellular Components</option>
+            <option value="GO:BP">BiologicalProcesses</option>
+            <option value="GO:MF">Molecular Function</option>
+            <option value="KEGG">KEGG pathway</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" label="GO category mapping" name="output" />
+    </outputs>
+    <tests></tests>
+    <help>
+
+        **What it does**
+
+        Returns a tabular file with GO gene categories.
+
+
+        </help>
+    <citations>
+    </citations>
+</tool>
--- a/goseq.r	Fri Feb 26 13:31:00 2016 -0500
+++ b/goseq.r	Sun Feb 28 11:52:10 2016 -0500
@@ -15,7 +15,9 @@
     make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"),
     make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"),
     make_option(c("-g", "--genome"), type="character", help = "Genome [used for looking up correct gene length]"),
-    make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file")
+    make_option(c("-i", "--gene_id"), type="character", help="Gene ID of gene column in DGE file"),
+    make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical", help="A boolean to indicate whether genes without a categorie should still be used. For example, a large number of gene may have no GO term annotated.  If thisoption is set to FALSE, those genes will be ignored in the calculation of p-values(default behaviour).  If this option is set to TRUE, then these genes will count towards  the  total  number  of  genes  outside  the  category  being  tested  (default behaviour prior to version 1.15.2)."
+)
   )
 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 args = parse_args(parser)
@@ -33,6 +35,7 @@
 length_bias_plot = args$length_bias_plot
 sample_vs_wallenius_plot = args$sample_vs_wallenius_plot
 repcnt = args$repcnt
+use_genes_without_cat = args$use_genes_without_cat
 
 # format DE genes into vector suitable for use with goseq
 dge_table = read.delim(dge_file, header = TRUE, sep="\t", check.names = FALSE)
@@ -53,14 +56,18 @@
 pdf(length_bias_plot)
 pwf=nullp(genes, genome, gene_id, gene_lengths)
 message = dev.off()
+
+# Fetch GO annotations:
+go_map=getgo(names(genes), genome, gene_id, fetch.cats=c("GO:CC", "GO:BP", "GO:MF", "KEGG"))
+
 # wallenius approximation of p-values
-GO.wall=goseq(pwf, genome, gene_id)
+GO.wall=goseq(pwf, genome, gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
 
-GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric")
+GO.nobias=goseq(pwf, genome, gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
 
 # Sampling distribution
 if (repcnt > 0) {
-  GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt)
+  GO.samp=goseq(pwf,genome, gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
   # Compare sampling with wallenius
   pdf(sample_vs_wallenius_plot)
   plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]),
--- a/goseq.xml	Fri Feb 26 13:31:00 2016 -0500
+++ b/goseq.xml	Sun Feb 28 11:52:10 2016 -0500
@@ -19,10 +19,13 @@
         --length_bias_plot "$length_bias_plot"
         --sample_vs_wallenius_plot "$sample_vs_wallenius_plot"
         --repcnt "$repcnt"
+        --use_genes_without_cat "$use_genes_without_cat"
     </command>
     <inputs>
         <param help="deseq2/edger/limma differential gene expression list" label="DGE list" name="dge_file" type="data" format="tabular" />
         <param help="Select the column that contains the multiple-testing corrected p-value" label="p adjust column" name="p_adj_column" type="data_column" numeric="true" data_ref="dge_file"/>
+        <param help="A boolean to indicate whether genes without a categorie should still be used. For example, a large number of gene may have no GO term annotated.  If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested"
+               name="use_genes_without_cat" label="Count genes without any category" type="boolean"/>
         <param help="Typically 0.05 after multiple testing correction" max="1" label="Minimum p adjust value to consider genes as differentially expressed" name="p_adj_cutoff" type="float" value="0.05" />
         <conditional name="source">
             <param help="This is needed if the gene length is not available in goseq. e.g. hg38 and mm10." label="Use gene length file?" name="use_length_file" type="select">
--- a/tool_dependencies.xml	Fri Feb 26 13:31:00 2016 -0500
+++ b/tool_dependencies.xml	Sun Feb 28 11:52:10 2016 -0500
@@ -4,6 +4,6 @@
          <repository changeset_revision="9f31a291b305" name="package_r_3_2_1" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
     <package name="goseq" version="1.22.0">
-         <repository changeset_revision="395b5f44b823" name="package_r_3_2_1_goseq_1_22_0" owner="mvdbeek" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+         <repository changeset_revision="6aec352282a6" name="package_r_3_2_1_goseq_1_22_0" owner="mvdbeek" toolshed="https://testtoolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>