Mercurial > repos > greg > gene_family_classifier
changeset 121:cf5a7194d866 draft
Uploaded
author | greg |
---|---|
date | Tue, 21 Mar 2017 10:30:35 -0400 |
parents | e64f6cdfa6b3 |
children | 78d2a8b63c13 |
files | gene_family_classifier.xml |
diffstat | 1 files changed, 117 insertions(+), 43 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_family_classifier.xml Mon Mar 20 14:28:59 2017 -0400 +++ b/gene_family_classifier.xml Tue Mar 21 10:30:35 2017 -0400 @@ -107,8 +107,8 @@ ]]> </command> <inputs> - <param name="input" format="fasta" type="data" label="Amino acids (proteins) sequences fasta file"/> - <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold"> + <param name="input" format="fasta" type="data" label="Proteins fasta file"/> + <param name="scaffold" type="select" label="Gene family scaffold"> <options from_data_table="plant_tribes_scaffolds" /> <validator type="no_options" message="No PlantTribes scaffolds are available. Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table."/> </param> @@ -118,91 +118,90 @@ <option value="orthomcl">OrthoMCL</option> </param> <conditional name="save_hmmscan_log_cond"> - <param name="classifier" type="select" label="Protein classification method"> + <param name="classifier" type="select" label="Protein classifier"> <option value="blastp" selected="true">blastp</option> <option value="hmmscan">HMMScan</option> <option value="both">Both blastp and HMMScan</option> </param> <when value="blastp" /> <when value="hmmscan"> - <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" help="Save the hmmscan log in an additional output dataset"> + <param name="save_hmmscan_log" type="select" label="Save hmmscan log?"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> </when> <when value="both"> - <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" 
help="Save the hmmscan log in an additional output dataset"> + <param name="save_hmmscan_log" type="select" label="Save hmmscan log?"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> </when> </conditional> <conditional name="options_type"> - <param name="options_type_selector" type="select" label="Options Configuration"> + <param name="options_type_selector" type="select" label="Options configuration"> <option value="basic" selected="true">Basic</option> <option value="advanced">Advanced</option> </param> <when value="basic" /> <when value="advanced"> <conditional name="specify_super_orthogroups_cond"> - <param name="specify_super_orthogroups" type="select" label="Specify super orthogroups?" help="Secondary MCL clusters of orthogroups"> + <param name="specify_super_orthogroups" type="select" label="Super orthogroups configuration"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> <when value="no"/> <when value="yes"> - <param name="super_orthogroups" type="select" label="Super orthogroups clustering specification"> - <option value="min_evalue" selected="true">Minimum e-value</option> - <option value="avg_evalue">Average e-value</option> + <param name="super_orthogroups" type="select" label="Clustering distance measure"> + <option value="min_evalue" selected="true">blastp e-value</option> </param> </when> </conditional> <conditional name="specify_single_copy_cond"> - <param name="specify_single_copy" type="select" label="Specify single copy orthogroup selection?"> + <param name="specify_single_copy" type="select" label="Single copy orthogroups configuration"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> <when value="no"/> <when value="yes"> <conditional name="single_copy_cond"> - <param name="single_copy" type="select" label="Select single copy orthogroup configuration option"> - <option value="custom" selected="true">Single copy orthogroup 
custom configuration</option> - <option value="taxa">Minimum single copy taxa required in orthogroup</option> + <param name="single_copy" type="select" label="Selection criterion"> + <option value="custom" selected="true">Custom selection</option> + <option value="taxa">Global selection</option> </param> <when value="custom"> <conditional name="single_copy_custom_cond"> - <param name="single_copy_custom" type="select" label="Select single copy orthogroup custom configuration from the current history?" help="Select No to use the default configuration"> + <param name="single_copy_custom" type="select" label="Custom selection configuration"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> <when value="no"/> <when value="yes"> - <param name="single_copy_custom_config" format="txt" type="data" label="Single copy orthogroup custom configuration file"/> + <param name="single_copy_custom_config" format="txt" type="data" label="Custom selection file"/> </when> </conditional> </when> <when value="taxa"> - <param name="single_copy_taxa" type="integer" value="20" label="Minimum single copy taxa required in orthogroup"/> - <param name="taxa_present" type="integer" value="21" label="Minimum taxa required in single copy orthogroup"/> + <param name="single_copy_taxa" type="integer" value="20" label="Minimum single copy taxa"/> + <param name="taxa_present" type="integer" value="21" label="Minimum taxa present"/> </when> </conditional> </when> </conditional> <conditional name="create_orthogroup_cond"> - <param name="create_orthogroup" type="select" label="Create orthogroup fasta files?"> + <param name="create_orthogroup" type="select" label="Orthogroups fasta configuration"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> <when value="no" /> <when value="yes"> <conditional name="create_corresponding_coding_sequences_cond"> - <param name="create_corresponding_coding_sequences" type="select" 
label="Create corresponding coding sequences?"> + <param name="create_corresponding_coding_sequences" type="select" label="Orthogroups coding sequences"> <option value="no" selected="true">No</option> <option value="yes">Yes</option> </param> <when value="no" /> <when value="yes"> - <param name="coding_sequences" format="fasta" type="data" label="Corresponding coding sequences (CDS) fasta file"/> + <param name="coding_sequences" format="fasta" type="data" label="Coding sequences fasta file"/> </when> </conditional> </when> @@ -247,31 +246,48 @@ </test> </tests> <help> -This tool is one of the PlantTribes collection of automated modular analysis pipelines that utilize objective classifications of -complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. This tool classifies gene -sequences into precomputed orthologous gene family clusters using either blastp (faster), HMMScan (slower but more sensitive -to remote homologs) or both (more exhaustive). - -This tool accepts any of the following as input: - -* the postprocessed assemblies produced by the **Postprocess de novo assembly transcripts into putative coding sequences** tool -* externally predicted coding sequences and their corresponding amino acid translations derived from a transcriptome assembly -* gene predictions from a sequenced genome +This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary +analyses of genome-scale gene families and transcriptomes. This tool classifies gene coding sequences either produced by +the AssemblyPostProcessor tool or from an external source into pre-computed orthologous gene family clusters (orthogroups) +of a PlantTribes scaffold. 
Classified sequences are then assigned with the corresponding orthogroups’ metadata that includes +gene counts of backbone taxa, super clusters (super orthogroups) at multiple stringencies, and functional annotations from +sources such as Gene Ontology (GO), InterPro protein domains, and UniProt KB/Swiss-Prot. Additionally, sequences belonging +to single/low-copy gene families that are mainly utilized in species tree inference can be determined. ----- -**Options** +**Required options** + + * **Proteins fasta file** - proteins fasta file either produced by the AssemblyPostProcessor tool or an external source selected from your history. + * **Gene family scaffold** - one of the PlantTribes gene family scaffolds [2-4] installed into Galaxy by the PlantTribes Scaffold Data Manager tool. + * **Protein clustering method** - gene family scaffold protein clustering method as described in the AssemblyPostProcessor tool. + * **Protein classifier** - Classifier to assign protein sequences into the orthogroups of a specified scaffold. PlantTribes implements three classification approaches: blastp (faster)[5], hmmscan (slower but more sensitive to the remote homologs)[6], and both blastp and hmmscan (more exhaustive). + +**Other options** + + * **Super orthogroups configuration** - select ‘Yes’ to enable super orthogroups configuration options. Super orthogroups are constructed through a second iteration of MCL clustering to connect distant, but potentially related orthogroup clusters. + + * **Clustering distance measure** - distance measure used in merging orthogroup clusters into super orthogroup clusters. PlantTribes pre-computed super orthogroups are based on the minimum and average blastp e-value between all pairs of scaffold orthogroups used as the input matrix for the MCL clustering algorithm[7]. + + * **Single copy orthogroups configuration** - select ‘Yes’ to enable single/low-copy orthogroups selection configuration options. 
- * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data installed into Galaxy by the PlantTribes Scaffolds Download Data Manager tool. - * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters). - * **Protein classification method** - blastp (faster), HMMScan (slower but more sensative to the remote homologs) or both (more exhaustive). - * **Super Orthogroups** - Secondary MCL clusters of orthogroups. - * **Specify single copy orthogroup selection?** - Specify a single copy orthogroup custom configuration or the minimum single copy taxa required in the orthogroup. - * **Select single copy orthogroup custom configuration from the current history?** - If a custom configuration is chosen, the configuration can be selected from the current history or the default configuration can be used. - * **Minimum single copy taxa required in orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only. - * **Minimum taxa required in single copy orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only. - * **Corresponding coding sequences (CDS) fasta file** - Used only when selecting "Create orthogroup fasta files?". - + * **Selection criterion** - single/low-copy orthogroups selection criterion. PlantTribes provides custom and global selection criteria for selecting user-defined single/low-copy scaffold orthogroups. + + * **Custom selection configuration** - select ‘Yes’ to enable selection of a single copy configuration file. Scaffold configuration templates (.singleCopy.config) of how to customize single/low-copy orthogroups selection can be found in the scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool, and are also available at the PlantTribes GitHub repository (https://github.com/dePamphilis/PlantTribes/config ). 
Single/low-copy settings shown in these templates are used as defaults if ‘No’ is selected. + + * **Custom selection file** - select a single/low-copy customized configuration file from your history. + + * **Global selection configuration** - Used with "Global selection" configuration only. + + * **Minimum single copy taxa** - Minimum number of taxa with single copy genes in the orthogroup. + * **Minimum taxa present** - Minimum number of taxa present in the orthogroup. + + * **Orthogroups fasta configuration** - select ‘Yes’ to create proteins orthogroups fasta files for the classified sequences. + + * **Orthogroups coding sequences** - select ‘Yes’ to create corresponding coding sequences orthogroups fasta files for the classified protein sequences. Requires a coding sequences fasta file corresponding to the proteins fasta file to be selected from your history. + + * **Coding sequences fasta file** - select coding sequences fasta file corresponding to the proteins fasta file from your history. + </help> <citations> <citation type="bibtex"> @@ -283,9 +299,67 @@ url = {https://github.com/dePamphilis/PlantTribes},} </citation> <citation type="bibtex"> + @article{Sasidharan2012, + journal = {Nucleic Acids Research}, + author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A}, + title = {GFam: a platform for automatic annotation of gene families}, + year = {2012}, + pages = {gks631},} + </citation> + <citation type="bibtex"> + @article{Li2003, + journal = {Genome Research}, + author = {3. Li L, Stoeckert CJ, Roos DS}, + title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes}, + year = {2003}, + volume = {13}, + number = {9}, + pages = {2178-2189},} + </citation> + <citation type="bibtex"> + @article{Emms2015, + journal = {Genome Biology}, + author = {4. 
Emms DM, Kelly S}, + title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy}, + year = {2015}, + volume = {16}, + number = {1}, + pages = {157},} + </citation> + <citation type="bibtex"> + @article{Altschul1990, + journal = {Journal of molecular biology}, + author = {5. Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ}, + title = {Basic local alignment search tool}, + year = {1990}, + volume = {215}, + number = {3}, + pages = {403-410},} + </citation> + <citation type="bibtex"> + @article{Eddy2009, + journal = {Genome Inform}, + author = {6. Eddy SR}, + title = {A new generation of homology search tools based on probabilistic inference}, + year = {2009}, + volume = {23}, + number = {1}, + pages = {205-211},} + </citation> + <citation type="bibtex"> + @article{Enright2002, + journal = {Nucleic acids research}, + author = {7. Enright AJ, Van Dongen S, Ouzounis CA}, + title = {An efficient algorithm for large-scale detection of protein families}, + year = {2002}, + volume = {30}, + number = {7}, + pages = {1575-1584},} + </citation> + <citation type="bibtex"> @article{None, journal = {GitHub repository}, - author = {2. None}, + author = {8. None}, title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database}, year = {2013}, url = {http://hmmer.org},}