Mercurial > repos > greg > gene_family_classifier

--- a/gene_family_classifier.xml	Mon Mar 20 14:28:59 2017 -0400
+++ b/gene_family_classifier.xml	Tue Mar 21 10:30:35 2017 -0400
@@ -107,8 +107,8 @@
         ]]>
     </command>
     <inputs>
-        <param name="input" format="fasta" type="data" label="Amino acids (proteins) sequences fasta file"/>
-        <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold">
+        <param name="input" format="fasta" type="data" label="Proteins fasta file"/>
+        <param name="scaffold" type="select" label="Gene family scaffold">
             <options from_data_table="plant_tribes_scaffolds" />
             <validator type="no_options" message="No PlantTribes scaffolds are available.  Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table."/>
         </param>
@@ -118,91 +118,90 @@
             <option value="orthomcl">OrthoMCL</option>
         </param>
         <conditional name="save_hmmscan_log_cond">
-            <param name="classifier" type="select" label="Protein classification method">
+            <param name="classifier" type="select" label="Protein classifier">
                 <option value="blastp" selected="true">blastp</option>
                 <option value="hmmscan">HMMScan</option>
                 <option value="both">Both blastp and HMMScan</option>
             </param>
             <when value="blastp" />
             <when value="hmmscan">
-                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" help="Save the hmmscan log in an additional output dataset">
+                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?">
                     <option value="no" selected="true">No</option>
                     <option value="yes">Yes</option>
                 </param>
             </when>
             <when value="both">
-                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" help="Save the hmmscan log in an additional output dataset">
+                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?">
                     <option value="no" selected="true">No</option>
                     <option value="yes">Yes</option>
                 </param>
             </when>
         </conditional>
         <conditional name="options_type">
-            <param name="options_type_selector" type="select" label="Options Configuration">
+            <param name="options_type_selector" type="select" label="Options configuration">
                 <option value="basic" selected="true">Basic</option>
                 <option value="advanced">Advanced</option>
             </param>
             <when value="basic" />
             <when value="advanced">
                 <conditional name="specify_super_orthogroups_cond">
-                    <param name="specify_super_orthogroups" type="select" label="Specify super orthogroups?" help="Secondary MCL clusters of orthogroups">
+                    <param name="specify_super_orthogroups" type="select" label="Super orthogroups configuration">
                         <option value="no" selected="true">No</option>
                         <option value="yes">Yes</option>
                     </param>
                     <when value="no"/>
                     <when value="yes">
-                        <param name="super_orthogroups" type="select" label="Super orthogroups clustering specification">
-                            <option value="min_evalue" selected="true">Minimum e-value</option>
-                            <option value="avg_evalue">Average e-value</option>
+                        <param name="super_orthogroups" type="select" label="Clustering distance measure">
+                            <option value="min_evalue" selected="true">blastp e-value</option>
                         </param>
                     </when>
                 </conditional>
                 <conditional name="specify_single_copy_cond">
-                    <param name="specify_single_copy" type="select" label="Specify single copy orthogroup selection?">
+                    <param name="specify_single_copy" type="select" label="Single copy orthogroups configuration">
                         <option value="no" selected="true">No</option>
                         <option value="yes">Yes</option>
                     </param>
                     <when value="no"/>
                     <when value="yes">
                         <conditional name="single_copy_cond">
-                            <param name="single_copy" type="select" label="Select single copy orthogroup configuration option">
-                                <option value="custom" selected="true">Single copy orthogroup custom configuration</option>
-                                <option value="taxa">Minimum single copy taxa required in orthogroup</option>
+                            <param name="single_copy" type="select" label="Selection criterion">
+                                <option value="custom" selected="true">Custom selection</option>
+                                <option value="taxa">Global selection</option>
                             </param>
                             <when value="custom">
                                 <conditional name="single_copy_custom_cond">
-                                    <param name="single_copy_custom" type="select" label="Select single copy orthogroup custom configuration from the current history?" help="Select No to use the default configuration">
+                                    <param name="single_copy_custom" type="select" label="Custom selection configuration">
                                         <option value="no" selected="true">No</option>
                                         <option value="yes">Yes</option>
                                     </param>
                                     <when value="no"/>
                                     <when value="yes">
-                                        <param name="single_copy_custom_config" format="txt" type="data" label="Single copy orthogroup custom configuration file"/>
+                                        <param name="single_copy_custom_config" format="txt" type="data" label="Custom selection file"/>
                                     </when>
                                 </conditional>
                             </when>
                             <when value="taxa">
-                                <param name="single_copy_taxa" type="integer" value="20" label="Minimum single copy taxa required in orthogroup"/>
-                                <param name="taxa_present" type="integer" value="21" label="Minimum taxa required in single copy orthogroup"/>
+                                <param name="single_copy_taxa" type="integer" value="20" label="Minimum single copy taxa"/>
+                                <param name="taxa_present" type="integer" value="21" label="Minimum taxa present"/>
                             </when>
                         </conditional>
                     </when>
                 </conditional>
                 <conditional name="create_orthogroup_cond">
-                    <param name="create_orthogroup" type="select" label="Create orthogroup fasta files?">
+                    <param name="create_orthogroup" type="select" label="Orthogroups fasta configuration">
                         <option value="no" selected="true">No</option>
                         <option value="yes">Yes</option>
                     </param>
                     <when value="no" />
                     <when value="yes">
                         <conditional name="create_corresponding_coding_sequences_cond">
-                            <param name="create_corresponding_coding_sequences" type="select" label="Create corresponding coding sequences?">
+                            <param name="create_corresponding_coding_sequences" type="select" label="Orthogroups coding sequences">
                                 <option value="no" selected="true">No</option>
                                 <option value="yes">Yes</option>
                             </param>
                             <when value="no" />
                             <when value="yes">
-                                <param name="coding_sequences" format="fasta" type="data" label="Corresponding coding sequences (CDS) fasta file"/>
+                                <param name="coding_sequences" format="fasta" type="data" label="Coding sequences fasta file"/>
                             </when>
                         </conditional>
                     </when>
@@ -247,31 +246,48 @@
         </test>
     </tests>
     <help>
-This tool is one of the PlantTribes collection of automated modular analysis pipelines that utilize objective classifications of
-complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies.  This tool classifies gene
-sequences into precomputed orthologous gene family clusters using either blastp (faster), HMMScan (slower but more sensitive
-to remote homologs) or both (more exhaustive).
-
-This tool accepts any of the following as input:
-
-* the postprocessed assemblies produced by the **Postprocess de novo assembly transcripts into putative coding sequences** tool
-* externally predicted coding sequences and their corresponding amino acid translations derived from a transcriptome assembly
-* gene predictions from a sequenced genome
+This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary
+analyses of genome-scale gene families and transcriptomes. This tool classifies gene coding sequences either produced by
+the AssemblyPostProcessor tool or from an external source into pre-computed orthologous gene family clusters (orthogroups)
+of a PlantTribes scaffold.  Classified sequences are then assigned with the corresponding orthogroups’ metadata that includes
+gene counts of backbone taxa, super clusters (super orthogroups) at multiple stringencies, and functional annotations from
+sources such as Gene Ontology (GO), InterPro protein domains, and UniProt KB/Swiss-Prot.  Additionally, sequences belonging
+to single/low-copy gene families that are mainly utilized in species tree inference can be determined.

 -----

-**Options**
+**Required options**
+
+ * **Proteins fasta file** - proteins fasta file either produced by the AssemblyPostProcessor tool or an external source selected from your history.
+ * **Gene family scaffold** - one of the PlantTribes gene family scaffolds [2-4] installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool.
+ * **Protein clustering method** - gene family scaffold protein clustering method as described in the AssemblyPostProcessor tool.
+ * **Protein classifier** - classifier used to assign protein sequences to the orthogroups of the specified scaffold. PlantTribes implements three classification approaches: blastp (faster)[5], hmmscan (slower but more sensitive to remote homologs)[6], and both blastp and hmmscan (more exhaustive).
+
+**Other options**
+
+ * **Super orthogroups configuration** - select ‘Yes’ to enable super orthogroups configuration options.  Super orthogroups are constructed through a second iteration of MCL clustering to connect distant, but potentially related orthogroup clusters.
+
+   * **Clustering distance measure** - distance measure used in merging orthogroup clusters into super orthogroup clusters.  PlantTribes pre-computed super orthogroups are based on the minimum and average blastp e-value between all pairs of scaffold orthogroups used as the input matrix for the MCL clustering algorithm[7].
+
+ * **Single copy orthogroups configuration** - select ‘Yes’ to enable single/low-copy orthogroups selection configuration options.

- * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool.
- * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters).
- * **Protein classification method** - blastp (faster), HMMScan (slower but more sensative to the remote homologs) or both (more exhaustive).
- * **Super Orthogroups** - Secondary MCL clusters of orthogroups.
- * **Specify single copy orthogroup selection?** - Specify a single copy orthogroup custom configuration or the minimum single copy taxa required in the orthogroup.
- * **Select single copy orthogroup custom configuration from the current history?** - If a custom configuration is chosen, the configuration can be selected from the current history or the default configuration can be used.
- * **Minimum single copy taxa required in orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only.
- * **Minimum taxa required in single copy orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only.
- * **Corresponding coding sequences (CDS) fasta file** - Used only when selecting "Create orthogroup fasta files?".
-
+   * **Selection criterion** - single/low-copy orthogroups selection criterion. PlantTribes provides custom and global selection criteria for selecting user-defined single/low-copy scaffold orthogroups.
+
+     * **Custom selection configuration** - select ‘Yes’ to enable selection of a single copy configuration file.  Scaffold configuration templates (.singleCopy.config) showing how to customize single/low-copy orthogroups selection can be found in the scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool, and are also available at the PlantTribes GitHub repository (https://github.com/dePamphilis/PlantTribes/config ).  Single/low-copy settings shown in these templates are used as defaults if ‘No’ is selected.
+
+       * **Custom selection file** - select a single/low-copy customized configuration file from your history.
+
+     * **Global selection configuration** - Used with "Global selection" configuration only.
+
+       * **Minimum single copy taxa** - Minimum number of taxa with single copy genes in the orthogroup.
+       * **Minimum taxa present** - Minimum number of taxa present in the orthogroup.
+
+ * **Orthogroups fasta configuration** - select ‘Yes’ to create proteins orthogroups fasta files for the classified sequences.
+
+   * **Orthogroups coding sequences** - select ‘Yes’ to create corresponding coding sequences orthogroups fasta files for the classified protein sequences. Requires a coding sequences fasta file corresponding to the proteins fasta file to be selected from your history.
+
+     * **Coding sequences fasta file** - select coding sequences fasta file corresponding to the proteins fasta file from your history.
+
     </help>
     <citations>
         <citation type="bibtex">
@@ -283,9 +299,67 @@
             url = {https://github.com/dePamphilis/PlantTribes},}
         </citation>
         <citation type="bibtex">
+            @article{Sasidharan2012,
+            journal = {Nucleic Acids Research},
+            author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A},
+            title = {GFam: a platform for automatic annotation of gene families},
+            year = {2012},
+            pages = {gks631},}
+        </citation>
+        <citation type="bibtex">
+            @article{Li2003,
+            journal = {Genome Research},
+            author = {3. Li L, Stoeckert CJ, Roos DS},
+            title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes},
+            year = {2003},
+            volume = {13},
+            number = {9},
+            pages = {2178-2189},}
+        </citation>
+        <citation type="bibtex">
+            @article{Emms2015,
+            journal = {Genome Biology},
+            author = {4. Emms DM, Kelly S},
+            title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy},
+            year = {2015},
+            volume = {16},
+            number = {1},
+            pages = {157},}
+        </citation>
+        <citation type="bibtex">
+            @article{Altschul1990,
+            journal = {Journal of molecular biology},
+            author = {5. Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ},
+            title = {Basic local alignment search tool},
+            year = {1990},
+            volume = {215},
+            number = {3},
+            pages = {403-410},}
+        </citation>
+        <citation type="bibtex">
+            @article{Eddy2009,
+            journal = {Genome Inform},
+            author = {6. Eddy SR},
+            title = {A new generation of homology search tools based on probabilistic inference},
+            year = {2009},
+            volume = {23},
+            number = {1},
+            pages = {205-211},}
+        </citation>
+        <citation type="bibtex">
+            @article{Enright2002,
+            journal = {Nucleic acids research},
+            author = {7. Enright AJ, Van Dongen S, Ouzounis CA},
+            title = {An efficient algorithm for large-scale detection of protein families},
+            year = {2002},
+            volume = {30},
+            number = {7},
+            pages = {1575-1584},}
+        </citation>
+        <citation type="bibtex">
             @article{None,
             journal = {GitHub repository},
-            author = {2. None},
+            author = {8. None},
             title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database},
             year = {2013},
             url = {http://hmmer.org},}