# HG changeset patch # User greg # Date 1490106635 14400 # Node ID cf5a7194d866d79dd4075e3381b01286cdeb6de0 # Parent e64f6cdfa6b35012c6afdb6c0c329d5293c8c225 Uploaded diff -r e64f6cdfa6b3 -r cf5a7194d866 gene_family_classifier.xml --- a/gene_family_classifier.xml Mon Mar 20 14:28:59 2017 -0400 +++ b/gene_family_classifier.xml Tue Mar 21 10:30:35 2017 -0400 @@ -107,8 +107,8 @@ ]]> - - + + @@ -118,91 +118,90 @@ - + - + - + - + - + - - - + + - + - - - + + + - + - + - - + + - + - + - + @@ -247,31 +246,48 @@ -This tool is one of the PlantTribes collection of automated modular analysis pipelines that utilize objective classifications of -complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. This tool classifies gene -sequences into precomputed orthologous gene family clusters using either blastp (faster), HMMScan (slower but more sensitive -to remote homologs) or both (more exhaustive). - -This tool accepts any of the following as input: - -* the postprocessed assemblies produced by the **Postprocess de novo assembly transcripts into putative coding sequences** tool -* externally predicted coding sequences and their corresponding amino acid translations derived from a transcriptome assembly -* gene predictions from a sequenced genome +This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary +analyses of genome-scale gene families and transcriptomes. This tool classifies gene coding sequences either produced by +the AssemblyPostProcessor tool or from an external source into pre-computed orthologous gene family clusters (orthogroups) +of a PlantTribes scaffold. 
Classified sequences are then assigned the corresponding orthogroups’ metadata that includes +gene counts of backbone taxa, super clusters (super orthogroups) at multiple stringencies, and functional annotations from +sources such as Gene Ontology (GO), InterPro protein domains, and UniProt KB/Swiss-Prot. Additionally, sequences belonging +to single/low-copy gene families that are mainly utilized in species tree inference can be determined. ----- -**Options** +**Required options** + + * **Proteins fasta file** - proteins fasta file either produced by the AssemblyPostProcessor tool or an external source selected from your history. + * **Gene family scaffold** - one of the PlantTribes gene family scaffolds [2-4] installed into Galaxy by the PlantTribes Scaffold Data Manager tool. + * **Protein clustering method** - gene family scaffold protein clustering method as described in the AssemblyPostProcessor tool. + * **Protein classifier** - Classifier to assign protein sequences into a specified scaffold's orthogroups. PlantTribes implements three classification approaches: blastp (faster)[5], hmmscan (slower but more sensitive to the remote homologs)[6], and both blastp and hmmscan (more exhaustive). + +**Other options** + + * **Super orthogroups configuration** - select ‘Yes’ to enable super orthogroups configuration options. Super orthogroups are constructed through a second iteration of MCL clustering to connect distant, but potentially related orthogroup clusters. + + * **Clustering distance measure** - distance measure used in merging orthogroup clusters into super orthogroup clusters. PlantTribes pre-computed super orthogroups are based on the minimum and average blastp e-value between all pairs of scaffold orthogroups used as the input matrix for the MCL clustering algorithm[7]. + + * **Single copy orthogroups configuration** - select ‘Yes’ to enable single/low-copy orthogroups selection configuration options. 
- * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool. - * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters). - * **Protein classification method** - blastp (faster), HMMScan (slower but more sensative to the remote homologs) or both (more exhaustive). - * **Super Orthogroups** - Secondary MCL clusters of orthogroups. - * **Specify single copy orthogroup selection?** - Specify a single copy orthogroup custom configuration or the minimum single copy taxa required in the orthogroup. - * **Select single copy orthogroup custom configuration from the current history?** - If a custom configuration is chosen, the configuration can be selected from the current history or the default configuration can be used. - * **Minimum single copy taxa required in orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only. - * **Minimum taxa required in single copy orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only. - * **Corresponding coding sequences (CDS) fasta file** - Used only when selecting "Create orthogroup fasta files?". - + * **Selection criterion** - single/low-copy orthogroups selection criterion. PlantTribes provides custom and global selection criteria for selecting user-defined single/low-copy scaffold orthogroups. + + * **Custom selection configuration** - select ‘Yes’ to enable selection of a single copy configuration file. Scaffold configuration templates (.singleCopy.config) of how to customize single/low-copy orthogroups selection can be found in the scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool, and are also available at the PlantTribes GitHub repository (https://github.com/dePamphilis/PlantTribes/config ). 
Single/low-copy settings shown in these templates are used as defaults if ‘No’ is selected. + + * **Custom selection file** - select a single/low-copy customized configuration file from your history. + + * **Global selection configuration** - Used with "Global selection" configuration only. + + * **Minimum single copy taxa** - Minimum number of taxa with single copy genes in the orthogroup. + * **Minimum taxa present** - Minimum number of taxa present in the orthogroup. + + * **Orthogroups fasta configuration** - select ‘Yes’ to create proteins orthogroups fasta files for the classified sequences. + + * **Orthogroups coding sequences** - select ‘Yes’ to create corresponding coding sequences orthogroups fasta files for the classified protein sequences. Requires the coding sequences fasta file corresponding to the proteins fasta file to be selected from your history. + + * **Coding sequences fasta file** - select coding sequences fasta file corresponding to the proteins fasta file from your history. + @@ -283,9 +299,67 @@ url = {https://github.com/dePamphilis/PlantTribes},} + @article{Sasidharan2012, + journal = {Nucleic Acids Research}, + author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A}, + title = {GFam: a platform for automatic annotation of gene families}, + year = {2012}, + pages = {gks631},} + + + @article{Li2003, + journal = {Genome Research}, + author = {3. Li L, Stoeckert CJ, Roos DS}, + title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes}, + year = {2003}, + volume = {13}, + number = {9}, + pages = {2178-2189},} + + + @article{Emms2015, + journal = {Genome Biology}, + author = {4. Emms DM, Kelly S}, + title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy}, + year = {2015}, + volume = {16}, + number = {1}, + pages = {157},} + + + @article{Altschul1990, + journal = {Journal of Molecular Biology}, + author = {5. 
Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ}, + title = {Basic local alignment search tool}, + year = {1990}, + volume = {215}, + number = {3}, + pages = {403-410},} + + + @article{Eddy2009, + journal = {Genome Inform}, + author = {6. Eddy SR}, + title = {A new generation of homology search tools based on probabilistic inference}, + year = {2009}, + volume = {23}, + number = {1}, + pages = {205-211},} + + + @article{Enright2002, + journal = {Nucleic Acids Research}, + author = {7. Enright AJ, Van Dongen S, Ouzounis CA}, + title = {An efficient algorithm for large-scale detection of protein families}, + year = {2002}, + volume = {30}, + number = {7}, + pages = {1575-1584},} + + @article{None, journal = {GitHub repository}, - author = {2. None}, + author = {8. None}, title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database}, year = {2013}, url = {http://hmmer.org},}