comparison goseq.xml @ 7:9ffae7bc23c2 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
author mvdbeek
date Mon, 07 Mar 2016 13:57:32 -0500
parents 0e9424413ab0
children 04b9c519d3e1
comparison
equal deleted inserted replaced
6:0e9424413ab0 7:9ffae7bc23c2
1 <tool id="goseq" name="goseq" version="0.2.0"> 1 <tool id="goseq" name="goseq" version="0.2.2">
2 <description>tests for overrepresented gene categories</description> 2 <description>tests for overrepresented gene categories</description>
3 <macros> 3 <macros>
4 <import>go_macros.xml</import> 4 <import>go_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
7 <expand macro="stdio" /> 7 <expand macro="stdio" />
8 <command interpreter="Rscript"> 8 <command interpreter="Rscript">
9 goseq.r --dge_file "$dge_file" 9 goseq.r --dge_file "$dge_file"
10 --p_adj_column "$p_adj_column"
11 --cutoff "$p_adj_cutoff"
12 --length_file "$length_file" 10 --length_file "$length_file"
13 --category_file "$category_file" 11 --category_file "$category_file"
12 #if "$methods['wallenius']":
14 --wallenius_tab "$wallenius_tab" 13 --wallenius_tab "$wallenius_tab"
14 #end if
15 #if "$methods['hypergeometric']":
16 --nobias_tab "$nobias_tab"
17 #end if
18 --repcnt "$methods.repcnt"
15 --sampling_tab "$sampling_tab" 19 --sampling_tab "$sampling_tab"
16 --nobias_tab "$nobias_tab" 20 --p_adj_method "$p_adj_method"
21 --use_genes_without_cat "$use_genes_without_cat"
22 --make_plots "$make_plots"
17 --length_bias_plot "$length_bias_plot" 23 --length_bias_plot "$length_bias_plot"
18 --sample_vs_wallenius_plot "$sample_vs_wallenius_plot" 24 --sample_vs_wallenius_plot "$sample_vs_wallenius_plot"
19 --repcnt "$repcnt"
20 --use_genes_without_cat "$use_genes_without_cat"
21 --make_plots "$make_plots"
22 </command> 25 </command>
23 <inputs> 26 <inputs>
24 <param help="deseq2/edger/limma differential gene expression list" label="DGE list" name="dge_file" type="data" format="tabular" /> 27 <param help="A tabular file with gene names in the first column, and TRUE or FALSE in the last column. TRUE means a gene is differentially expressed. See help section for details." label="Differentially expressed gene file" name="dge_file" type="data" format="tabular" />
25 <param help="Select the column that contains the multiple-testing corrected p-value" label="p adjust column" name="p_adj_column" type="data_column" numeric="true" data_ref="dge_file"/>
26 <param label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" name="length_file" type="data" format="tabular" required="true" /> 28 <param label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" name="length_file" type="data" format="tabular" required="true" />
27 <param label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" name="category_file" type="data" format="tabular" required="true" /> 29 <param label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" name="category_file" type="data" format="tabular" required="true" />
28 <param help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested" 30 <param help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested"
29 name="use_genes_without_cat" label="Count genes without any category?" type="boolean"/> 31 name="use_genes_without_cat" label="Count genes without any category?" type="boolean"/>
30 <param help="Typically 0.05 after multiple testing correction" max="1" label="Minimum p adjust value to consider genes as differentially expressed" name="p_adj_cutoff" type="float" value="0.05" /> 32 <section name="methods" title="Method options" expanded="True">
33 <param name="wallenius" type="boolean" checked="true" label="Use wallenius method" help="See help for details" />
34 <param name="hypergeometric" type="boolean" checked="false" label="Use hypergeometric method" help="Does not use gene length information. See help for details" />
35 <param help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" name="repcnt" size="3" type="integer" min="0" max="10000" value="0" />
36 </section>
37 <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction">
38 <option value="BH" selected="true">Benjamini-Hochberg [FDR] (1995)</option>
39 <option value="holm">Holm (1979)</option>
40 <option value="hommel">Hommel (1988)</option>
41 <option value="hochberg">Hochberg (1988)</option>
42 <option value="bonferroni">Bonferroni</option>
43 <option value="BY">Benjamini - Yekutieli (2001)</option>
44 </param>
31 <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param> 45 <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param>
32 <param help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" name="repcnt" size="3" type="integer" min="0" max="10000" value="0" />
33 </inputs> 46 </inputs>
34 <outputs> 47 <outputs>
35 <data format="pdf" label="length bias plot" name="length_bias_plot"> 48 <data format="pdf" label="length bias plot" name="length_bias_plot">
36 <filter>make_plots</filter> 49 <filter>make_plots</filter>
50 <filter>methods['hypergeometric']</filter>
37 </data> 51 </data>
38 <data format="pdf" label="Plot P-value from sampling against wallenius distribution" name="sample_vs_wallenius_plot"> 52 <data format="pdf" label="Plot P-value from sampling against wallenius distribution" name="sample_vs_wallenius_plot">
39 <filter>repcnt != 0</filter> 53 <filter>methods['repcnt'] != 0</filter>
54 <filter>methods['wallenius']</filter>
40 <filter>make_plots</filter> 55 <filter>make_plots</filter>
41 </data> 56 </data>
42 <data format="tabular" label="Ranked category list - no length bias correction" name="nobias_tab" /> 57 <data format="tabular" label="Ranked category list - no length bias correction" name="nobias_tab">
58 <filter>methods['hypergeometric']</filter>
59 </data>
43 <data format="tabular" label="Ranked category list - sampling" name="sampling_tab"> 60 <data format="tabular" label="Ranked category list - sampling" name="sampling_tab">
44 <filter>repcnt != 0</filter> 61 <filter>methods['repcnt'] != 0</filter>
45 </data> 62 </data>
46 <data format="tabular" label="Ranked category list - wallenius approx. of p-values" name="wallenius_tab" /> 63 <data format="tabular" label="Ranked category list - wallenius method" name="wallenius_tab">
64 <filter>methods['wallenius']</filter>
65 </data>
47 </outputs> 66 </outputs>
48 <tests> 67 <tests>
49 <test> 68 <test>
50 <param name="dge_file" value="dge_list.tab" ftype="tabular"/> 69 <param name="dge_file" value="dge_list.tab" ftype="tabular"/>
51 <param name="length_file" value="gene_length.tab" ftype="tabular"/> 70 <param name="length_file" value="gene_length.tab" ftype="tabular"/>
64 Options map closely to the excellent manual_ 83 Options map closely to the excellent manual_
65 84
66 85
67 **Input files** 86 **Input files**
68 87
69 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes. 88 *DGE list:*
89 goseq needs a tabular file with genes in the first column, and TRUE or FALSE in the last column.
90 TRUE means the gene should count as differentially expressed, FALSE means it is not differentially expressed.
91 You can use the "Compute an expression on every row" tool to create a TRUE / FALSE column for your dataset.
92
93 *Gene length file:*
94 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes
95 using a probability weighting function (PWF).
70 The format of this file is tabular, with gene_id in the first column and length in the second column. 96 The format of this file is tabular, with gene_id in the first column and length in the second column.
71 The "get length and gc content" tool can produce such a file. 97 The "get length and gc content" tool can produce such a file.
72 98
99 *Gene category file:*
73 You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column, 100 You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column,
74 category name in the second column. If you are interested in gene ontology categories you can use the getgo tool to retrieve 101 category name in the second column. If you are interested in gene ontology categories you can use the getgo tool to retrieve
75 gene ontologies for model organisms, or you can construct your own file. 102 gene ontologies for model organisms, or you can construct your own file.
103
104 **Method options**
105
106 3 methods, "Wallenius", "Sampling" and "Hypergeometric", can be used to calculate the p-values as follows.
107
108 *"Wallenius"* approximates the true distribution of numbers of members of a category amongst DE genes by the Wallenius non-central hypergeometric distribution.
109 This distribution assumes that within a category all genes have the same probability of being chosen.
110 Therefore, this approximation works best when the range in probabilities obtained by the probability weighting function is small.
111
112 *"Sampling"* uses random sampling to approximate the true distribution and uses it to calculate the p-values for over (and under) representation of categories.
113 Although this is the most accurate method given a high enough value of sampling depth, its use quickly becomes computationally prohibitive.
114
115 *"Hypergeometric"* assumes there is no bias in power to detect differential expression at all and calculates the p-values using a standard hypergeometric distribution.
116 Useful if you wish to test the effect of selection bias on your results.
117
118 CAUTION: "Hypergeometric" should NEVER be used for producing results for biological interpretation.
119 If there is genuinely no bias in power to detect DE in your experiment, the PWF will reflect this and the other methods will produce accurate results.
76 120
77 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf 121 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf
78 122
79 123
80 </help> 124 </help>