Mercurial > repos > mvdbeek > r_goseq_1_22_0
comparison goseq.xml @ 7:9ffae7bc23c2 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/goseq_1_22_0 commit f95b47ed1a09ce14d3b565e8ea56d8bf12c35814-dirty
author | mvdbeek |
---|---|
date | Mon, 07 Mar 2016 13:57:32 -0500 |
parents | 0e9424413ab0 |
children | 04b9c519d3e1 |
comparison
equal
deleted
inserted
replaced
6:0e9424413ab0 | 7:9ffae7bc23c2 |
---|---|
1 <tool id="goseq" name="goseq" version="0.2.0"> | 1 <tool id="goseq" name="goseq" version="0.2.2"> |
2 <description>tests for overrepresented gene categories</description> | 2 <description>tests for overrepresented gene categories</description> |
3 <macros> | 3 <macros> |
4 <import>go_macros.xml</import> | 4 <import>go_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements" /> | 6 <expand macro="requirements" /> |
7 <expand macro="stdio" /> | 7 <expand macro="stdio" /> |
8 <command interpreter="Rscript"> | 8 <command interpreter="Rscript"> |
9 goseq.r --dge_file "$dge_file" | 9 goseq.r --dge_file "$dge_file" |
10 --p_adj_column "$p_adj_column" | |
11 --cutoff "$p_adj_cutoff" | |
12 --length_file "$length_file" | 10 --length_file "$length_file" |
13 --category_file "$category_file" | 11 --category_file "$category_file" |
12 #if "$methods['wallenius']": | |
14 --wallenius_tab "$wallenius_tab" | 13 --wallenius_tab "$wallenius_tab" |
14 #end if | |
15 #if "$methods['hypergeometric']": | |
16 --nobias_tab "$nobias_tab" | |
17 #end if | |
18 --repcnt "$methods.repcnt" | |
15 --sampling_tab "$sampling_tab" | 19 --sampling_tab "$sampling_tab" |
16 --nobias_tab "$nobias_tab" | 20 --p_adj_method "$p_adj_method" |
21 --use_genes_without_cat "$use_genes_without_cat" | |
22 --make_plots "$make_plots" | |
17 --length_bias_plot "$length_bias_plot" | 23 --length_bias_plot "$length_bias_plot" |
18 --sample_vs_wallenius_plot "$sample_vs_wallenius_plot" | 24 --sample_vs_wallenius_plot "$sample_vs_wallenius_plot" |
19 --repcnt "$repcnt" | |
20 --use_genes_without_cat "$use_genes_without_cat" | |
21 --make_plots "$make_plots" | |
22 </command> | 25 </command> |
23 <inputs> | 26 <inputs> |
24 <param help="deseq2/edger/limma differential gene expression list" label="DGE list" name="dge_file" type="data" format="tabular" /> | 27 <param help="A tabular file with gene names in the first column, and TRUE or FALSE in the last column. TRUE means a gene is differentially expressed. See help section for details." label="Differentially expressed gene file" name="dge_file" type="data" format="tabular" /> |
25 <param help="Select the column that contains the multiple-testing corrected p-value" label="p adjust column" name="p_adj_column" type="data_column" numeric="true" data_ref="dge_file"/> | |
26 <param label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" name="length_file" type="data" format="tabular" required="true" /> | 28 <param label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" name="length_file" type="data" format="tabular" required="true" /> |
27 <param label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" name="category_file" type="data" format="tabular" required="true" /> | 29 <param label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" name="category_file" type="data" format="tabular" required="true" /> |
28 <param help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested" | 30 <param help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested" |
29 name="use_genes_without_cat" label="Count genes without any category?" type="boolean"/> | 31 name="use_genes_without_cat" label="Count genes without any category?" type="boolean"/> |
30 <param help="Typically 0.05 after multiple testing correction" max="1" label="Minimum p adjust value to consider genes as differentially expressed" name="p_adj_cutoff" type="float" value="0.05" /> | 32 <section name="methods" title="Method options" expanded="True"> |
33 <param name="wallenius" type="boolean" checked="true" label="Use wallenius method" help="See help for details" /> | |
34 <param name="hypergeometric" type="boolean" checked="false" label="Use hypergeometric method" help="Does not use gene length information. See help for details" /> | |
35 <param help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" name="repcnt" size="3" type="integer" min="0" max="10000" value="0" /> | |
36 </section> | |
37 <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction"> | |
38 <option value="BH" selected="true">Benjamini-Hochberg [FDR] (1995)</option> | |
39 <option value="holm">Holm (1979)</option> | |
40 <option value="hommel">Hommel (1988)</option> | |
41 <option value="hochberg">Hochberg (1988)</option> | |
42 <option value="bonferroni">Bonferroni</option> | |
43 <option value="BY">Benjamini - Yekutieli (2001)</option> | |
44 </param> | |
31 <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param> | 45 <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param> |
32 <param help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" name="repcnt" size="3" type="integer" min="0" max="10000" value="0" /> | |
33 </inputs> | 46 </inputs> |
34 <outputs> | 47 <outputs> |
35 <data format="pdf" label="length bias plot" name="length_bias_plot"> | 48 <data format="pdf" label="length bias plot" name="length_bias_plot"> |
36 <filter>make_plots</filter> | 49 <filter>make_plots</filter> |
50 <filter>methods['hypergeometric']</filter> | |
37 </data> | 51 </data> |
38 <data format="pdf" label="Plot P-value from sampling against wallenius distribution" name="sample_vs_wallenius_plot"> | 52 <data format="pdf" label="Plot P-value from sampling against wallenius distribution" name="sample_vs_wallenius_plot"> |
39 <filter>repcnt != 0</filter> | 53 <filter>methods['repcnt'] != 0</filter> |
54 <filter>methods['wallenius']</filter> | |
40 <filter>make_plots</filter> | 55 <filter>make_plots</filter> |
41 </data> | 56 </data> |
42 <data format="tabular" label="Ranked category list - no length bias correction" name="nobias_tab" /> | 57 <data format="tabular" label="Ranked category list - no length bias correction" name="nobias_tab"> |
58 <filter>methods['hypergeometric']</filter> | |
59 </data> | |
43 <data format="tabular" label="Ranked category list - sampling" name="sampling_tab"> | 60 <data format="tabular" label="Ranked category list - sampling" name="sampling_tab"> |
44 <filter>repcnt != 0</filter> | 61 <filter>methods['repcnt'] != 0</filter> |
45 </data> | 62 </data> |
46 <data format="tabular" label="Ranked category list - wallenius approx. of p-values" name="wallenius_tab" /> | 63 <data format="tabular" label="Ranked category list - wallenius method" name="wallenius_tab"> |
64 <filter>methods['wallenius']</filter> | |
65 </data> | |
47 </outputs> | 66 </outputs> |
48 <tests> | 67 <tests> |
49 <test> | 68 <test> |
50 <param name="dge_file" value="dge_list.tab" ftype="tabular"/> | 69 <param name="dge_file" value="dge_list.tab" ftype="tabular"/> |
51 <param name="length_file" value="gene_length.tab" ftype="tabular"/> | 70 <param name="length_file" value="gene_length.tab" ftype="tabular"/> |
64 Options map closely to the excellent manual_ | 83 Options map closely to the excellent manual_ |
65 | 84 |
66 | 85 |
67 **Input files** | 86 **Input files** |
68 | 87 |
69 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes. | 88 *DGE list:* |
89 goseq needs a tabular file with genes in the first column, and TRUE or FALSE in the last column. | |
90 TRUE means the gene should count as differentially expressed, FALSE means it is not differentially expressed. | |
91 You can use the "Compute an expression on every row" tool to create a TRUE / FALSE column for your dataset. | |
92 | |
93 *Gene length file:* | |
94 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes | |
95 using a probability weighting function (PWF). | |
70 The format of this file is tabular, with gene_id in the first column and length in the second column. | 96 The format of this file is tabular, with gene_id in the first column and length in the second column. |
71 The "get length and gc content" tool can produce such a file. | 97 The "get length and gc content" tool can produce such a file. |
72 | 98 |
99 *Gene category file:* | |
73 You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column, | 100 You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column, |
74 category name in the second column. If you are interested in gene ontology categories you can use the getgo tool to retrieve | 101 category name in the second column. If you are interested in gene ontology categories you can use the getgo tool to retrieve |
75 gene ontologies for model organisms, or you can construct your own file. | 102 gene ontologies for model organisms, or you can construct your own file. |
103 | |
104 **Method options** | |
105 | |
106 3 methods, "Wallenius", "Sampling" and "Hypergeometric", can be used to calculate the p-values as follows. | |
107 | |
108 *"Wallenius"* approximates the true distribution of numbers of members of a category amongst DE genes by the Wallenius non-central hypergeometric distribution. | |
109 This distribution assumes that within a category all genes have the same probability of being chosen. | |
110 Therefore, this approximation works best when the range in probabilities obtained by the probability weighting function is small. | |
111 | |
112 *"Sampling"* uses random sampling to approximate the true distribution and uses it to calculate the p-values for over (and under) representation of categories. | |
113 Although this is the most accurate method given a high enough value of sampling depth, its use quickly becomes computationally prohibitive. | |
114 | |
115 *"Hypergeometric"* assumes there is no bias in power to detect differential expression at all and calculates the p-values using a standard hypergeometric distribution. | |
116 Useful if you wish to test the effect of selection bias on your results. | |
117 | |
118 CAUTION: "Hypergeometric" should NEVER be used for producing results for biological interpretation. | |
119 If there is genuinely no bias in power to detect DE in your experiment, the PWF will reflect this and the other methods will produce accurate results. | |
76 | 120 |
77 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf | 121 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf |
78 | 122 |
79 | 123 |
80 </help> | 124 </help> |