comparison kaks_analysis.xml @ 27:f174450ebc44 draft

Uploaded
author greg
date Fri, 28 Apr 2017 14:23:46 -0400
parents 73db26d39092
children b807167c1e60
comparison
equal deleted inserted replaced
26:73db26d39092 27:f174450ebc44
77 && mv $output_dir/*.components '$output_components' 77 && mv $output_dir/*.components '$output_components'
78 #end if 78 #end if
79 ]]> 79 ]]>
80 </command> 80 </command>
81 <inputs> 81 <inputs>
82 <param name="coding_sequences_species_1" format="fasta" type="data" label="Coding sequences (CDS) fasta file for species1" /> 82 <param name="coding_sequences_species_1" format="fasta" type="data" label="Coding sequences for the first species" />
83 <param name="proteins_species_1" format="fasta" type="data" label="Aamino acids (proteins) sequences fasta file for species1" /> 83 <param name="proteins_species_1" format="fasta" type="data" label="Protein sequences for the first species" />
84 <conditional name="comparison_cond"> 84 <conditional name="comparison_cond">
85 <param name="comparison" type="select" label="Select method for pairwise sequence comparison to determine homolgous pairs" help="Cross species comparison requires selection of inputs for second species"> 85 <param name="comparison" type="select" label="Type of sequence comparison">
86 <option value="paralogs" selected="true">Self species comparison</option> 86 <option value="paralogs" selected="true">Paralogous</option>
87 <option value="orthologs">Cross species comparison</option> 87 <option value="orthologs">Orthologous</option>
88 </param> 88 </param>
89 <when value="paralogs" /> 89 <when value="paralogs" />
90 <when value="orthologs"> 90 <when value="orthologs">
91 <param name="coding_sequences_species_2" format="fasta" type="data" label="Coding sequences (CDS) fasta file for species2" /> 91 <param name="coding_sequences_species_2" format="fasta" type="data" label="Coding sequences for the second species" />
92 <param name="proteins_species_2" format="fasta" type="data" label="Aamino acids (proteins) sequences fasta file for species2" /> 92 <param name="proteins_species_2" format="fasta" type="data" label="Protein sequences for the second species" />
93 </when> 93 </when>
94 </conditional> 94 </conditional>
95 <conditional name="options_type"> 95 <conditional name="options_type">
96 <param name="options_type_selector" type="select" label="Options Configuration"> 96 <param name="options_type_selector" type="select" label="Options Configuration">
97 <option value="basic" selected="true">Basic</option> 97 <option value="basic" selected="true">Basic</option>
98 <option value="advanced">Advanced</option> 98 <option value="advanced">Advanced</option>
99 </param> 99 </param>
100 <when value="basic" /> 100 <when value="basic" />
101 <when value="advanced"> 101 <when value="advanced">
102 <conditional name="set_min_coverage_cond"> 102 <conditional name="set_min_coverage_cond">
103 <param name="set_min_coverage" type="select" label="Specify minimum sequence pairwise coverage length between homologous pairs?"> 103 <param name="set_min_coverage" type="select" label="Alignment coverage configuration">
104 <option value="no" selected="true">No</option> 104 <option value="no" selected="true">No</option>
105 <option value="yes">Yes</option> 105 <option value="yes">Yes</option>
106 </param> 106 </param>
107 <when value="no" /> 107 <when value="no" />
108 <when value="yes"> 108 <when value="yes">
109 <param name="min_coverage" type="float" value="0.5" min="0.3" max="1.0" label="Minimum sequence pairwise coverage length between homologous pairs" /> 109 <param name="min_coverage" type="float" value="0.5" min="0.3" max="1.0" label="match score" />
110 </when> 110 </when>
111 </conditional> 111 </conditional>
112 <conditional name="recalibrate_cond"> 112 <conditional name="recalibrate_cond">
113 <param name="recalibrate" type="select" label="Specify evolutionary rate for recalibrating synonymous subsitutions (ks) of species?"> 113 <param name="recalibrate" type="select" label="Species rates recalibration configuration">
114 <option value="no" selected="true">No</option> 114 <option value="no" selected="true">No</option>
115 <option value="yes">Yes</option> 115 <option value="yes">Yes</option>
116 </param> 116 </param>
117 <when value="no" /> 117 <when value="no" />
118 <when value="yes"> 118 <when value="yes">
119 <param name="recalibration_rate" type="float" value="0.0" min="0.0" label="Evolutionary rate for recalibrating synonymous subsitutions (ks) of species" /> 119 <param name="recalibration_rate" type="float" value="0.0" min="0.0" label="Recalibration rate" />
120 </when> 120 </when>
121 </conditional> 121 </conditional>
122 <conditional name="codeml_ctl_file_cond"> 122 <conditional name="codeml_ctl_file_cond">
123 <param name="codeml_ctl_file_select" type="select" label="Select PAML codeml control file?" help="Used for ML analysis of protein-coding DNA sequences using codon substitution models, select No to use the default control file"> 123 <param name="codeml_ctl_file_select" type="select" label="PAML codeml configuration">
124 <option value="no" selected="true">No</option> 124 <option value="no" selected="true">No</option>
125 <option value="yes">Yes</option> 125 <option value="yes">Yes</option>
126 </param> 126 </param>
127 <when value="no" /> 127 <when value="no" />
128 <when value="yes"> 128 <when value="yes">
129 <param name="codeml_ctl_file" format="txt" type="data" label="PAML codeml control file" /> 129 <param name="codeml_ctl_file" format="txt" type="data" label="PAML codeml control file" />
130 </when> 130 </when>
131 </conditional> 131 </conditional>
132 <conditional name="fit_components_cond"> 132 <conditional name="fit_components_cond">
133 <param name="fit_components" type="select" label="Fit a mixture model of multivariate normal components to synonymous (ks) distribution?" help="Used to identify significant duplication events in a genome"> 133 <param name="fit_components" type="select" label="Rates clustering configuration">
134 <option value="no" selected="true">No</option> 134 <option value="no" selected="true">No</option>
135 <option value="yes">Yes</option> 135 <option value="yes">Yes</option>
136 </param> 136 </param>
137 <when value="no" /> 137 <when value="no" />
138 <when value="yes"> 138 <when value="yes">
139 <param name="num_of_components" type="integer" value="1" min="1" label="Number of components to fit to synonymous subsitutions (ks) distribution" /> 139 <param name="num_of_components" type="integer" value="1" min="1" label="Number of components" />
140 </when> 140 </when>
141 </conditional> 141 </conditional>
142 <conditional name="set_lower_ks_limit_cond"> 142 <conditional name="set_lower_ks_limit_cond">
143 <param name="set_lower_ks_limit" type="select" label="Set lower limit of synonymous subsitutions (ks)?" help="Reduces background noise from young paralogous pairs due to normal gene births and deaths in a genome"> 143 <param name="set_lower_ks_limit" type="select" label="Lower limit synonymous subsitution rates configuration">
144 <option value="no" selected="true">No</option> 144 <option value="no" selected="true">No</option>
145 <option value="yes">Yes</option> 145 <option value="yes">Yes</option>
146 </param> 146 </param>
147 <when value="no" /> 147 <when value="no" />
148 <when value="yes"> 148 <when value="yes">
149 <param name="min_ks" type="float" value="0.0" min="0.0" label="Lower limit of synonymous subsitutions (ks)" /> 149 <param name="min_ks" type="float" value="0.0" min="0.0" label="Minimum rate" />
150 </when> 150 </when>
151 </conditional> 151 </conditional>
152 <conditional name="set_upper_ks_limit_cond"> 152 <conditional name="set_upper_ks_limit_cond">
153 <param name="set_upper_ks_limit" type="select" label="Set upper limit of synonymous subsitutions (ks)?" help="Excludes likey ancient paralogous pairs"> 153 <param name="set_upper_ks_limit" type="select" label="Upper limit synonymous subsitution rates configuration">
154 <option value="no" selected="true">No</option> 154 <option value="no" selected="true">No</option>
155 <option value="yes">Yes</option> 155 <option value="yes">Yes</option>
156 </param> 156 </param>
157 <when value="no" /> 157 <when value="no" />
158 <when value="yes"> 158 <when value="yes">
159 <param name="max_ks" type="float" value="0.0" min="0.0" label="Upper limit of synonymous subsitutions (ks)" /> 159 <param name="max_ks" type="float" value="0.0" min="0.0" label="Maximum rate" />
160 </when> 160 </when>
161 </conditional> 161 </conditional>
162 </when> 162 </when>
163 </conditional> 163 </conditional>
164 <!-- Required due to the Emmix license --> 164 <!-- Required due to the Emmix license -->
193 <tests> 193 <tests>
194 <test> 194 <test>
195 </test> 195 </test>
196 </tests> 196 </tests>
197 <help> 197 <help>
198 This tool is one of the PlantTribes collection of automated modular analysis pipelines that utilize objective classifications of 198 This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary analyses
199 complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. This tool performs orthologous 199 of genome-scale gene families and transcriptomes. This tool estimates paralogous and orthologous pairwise synonymous (Ks) and
200 or paralogous ks analyses of coding sequences and amino acid sequences. 200 non-synonymous (Ka) substitution rates for a set of gene coding sequences either produced by the AssemblyPostProcessor tool or
201 from an external source. Optionally, the resulting set of estimated Ks values can be clustered into components using a mixture
202 of multivariate normal distributions to identify significant duplication event(s) in a species or a pair of species.
201 203
202 ----- 204 -----
203 205
204 **Options** 206 **Options**
205 207
206 * **Required** 208 * **Required**
207 209
208 - **Coding sequences (CDS) fasta file for species1** - Coding sequences (CDS) fasta file for species1. 210 - **Coding sequences for the first species** - coding sequence fasta file for the first species either produced by the AssemblyPostProcessor tool or an external source selected from your history.
209 - **Aamino acids (proteins) sequences fasta file for species1** - Aamino acids (proteins) sequences fasta file for species1 211 - **Protein sequences for the first species** - corresponding protein sequence fasta files for the first species either produced by the AssemblyPostProcessor tool or an external source selected from your history.
210 - **Select method for pairwise sequence comparison to determine homolgous pairs** - Pairwise sequence comparison to determine homolgous pairs (cross species comparison requires selection of inputs for species2). 212 - **Type of sequence comparison** - pairwise sequence comparison to determine homologous pairs. This can be either paralogous for self-species comparison or orthologous for cross-species comparison. Cross-species comparison requires data selected for the second species.
211 213
212 * **Optional** 214 * **Optional**
213 215
214 - **Minimum sequence pairwise coverage length between homologous pairs** - Minimum sequence pairwise coverage length between homologous pairs (e.g., 0.5 results in 50% coverage. Legal values lie between 0.3 and 1.0. 216 - **Coding sequences for the second species** - coding sequence fasta file for the second species either produced by the AssemblyPostProcessor tool or an external source selected from your history. Required only for orthologous comparison.
215 - **Evolutionary rate for recalibrating synonymous subsitutions (ks) of species** - (applies to paralogous ks analysis) Recalibrate synonymous subsitutions (ks) of species using a predetermined evoutionary rate that can be determined from a species tree inferred from a collection single copy genes from taxa of interest (Cui et al., 2006). 217 - **Protein sequences for the second species** - corresponding protein sequence fasta files for the second species either produced by the AssemblyPostProcessor tool or an external source selected from your history. Required only for orthologous comparison.
216 - **Select PAML codeml control file?** - Select PAML's codeml control file from your history. This file is used to to perfom ML analysis of protein-coding DNA sequences using codon substitution models. Selecting No uses the default file which does not include input (seqfile, treefile) and output (outfile) parameters of codeml. 218 - **Alignment coverage configuration** - select 'Yes' to set the minimum allowable alignment coverage length between homologous pairs. PlantTribes uses global codon alignment match score to determine the pairwise alignment coverage. By default, the match score is set to 0.5 if 'No' is selected.
217 - **Fit a mixture model of multivariate normal components to synonymous (ks) distribution?** - Fit a mixture model of multivariate normal components to synonymous (ks) distribution to identify significant duplication event(s) in a genome. 219
218 - **Number components to fit to synonymous subsitutions (ks) distribution** - Number components to fit to synonymous subsitutions (ks) distribution. 220 - **match score** - number of base matches in a pairwise sequence alignment divided by the length of shorter sequence. Positions in the alignment corresponding to gaps are not considered. The score is restricted to the range 0.3 - 1.0.
219 - **Lower limit of synonymous subsitutions (ks)** - Lower limit of synonymous subsitutions (ks) - necessary if fitting components to the distribution to reduce background noise from young paralogous pairs due to normal gene births and deaths in a genome. 221
220 - **Upper limit of synonymous subsitutions (ks)** - Upper limit of synonymous subsitutions (ks) - necessary if fitting components to the distribution to exclude likey ancient paralogous pairs. 222 - **Species rates recalibration configuration** - select 'Yes' to recalibrate synonymous substitution rates of a species using a predetermined evolutionary rate. Recalibration evolutionary rate can be determined from a species tree inferred from a collection of conserved single copy genes from taxa of interest as described in Cui et al., 2006. Applies only to paralogous comparisons.
223
224 - **Recalibration rate** - a predetermined evolutionary recalibration rate.
225
226 - **PAML codeml configuration** - select 'Yes' to enable selection of a PAML codeml control file to carry out maximum likelihood analysis of protein-coding DNA sequences using codon substitution models. Template file "codeml.ctl.args" can be found in the scaffold data installed into Galaxy via the PlantTribes Scaffolds Download Data Manager tool, and are also available at the PlantTribes GitHub `repository`_. Default settings shown in the template are used if 'No' is selected.
227
228 .. _repository: https://github.com/dePamphilis/PlantTribes/blob/master/config/codeml.ctl.args
229
230 - **Rates clustering configuration** - select 'Yes' to estimate clusters of synonymous substitution rates using a mixture of multivariate normal distributions which represent putative duplication event(s).
231
232 - **Number of components** - number of components to include in the normal mixture model.
233
234 - **Lower limit synonymous subsitution rates configuration** - select 'Yes' to set the minimum allowable synonymous substitution rate to use in the normal mixtures cluster analysis to exclude young paralogs that arise from normal gene births and deaths in a genome.
235
236 - **Minimum rate** - minimum allowable synonymous substitution rate.
237
238 - **Upper limit synonymous subsitution rates configuration** - select 'Yes' to set the maximum allowable synonymous substitution rate to use in the normal mixtures cluster analysis to exclude likely ancient paralogs in a genome.
239
240
241 - **Maximum rate** - maximum allowable synonymous substitution rate.
221 242
222 </help> 243 </help>
223 <citations> 244 <citations>
224 <expand macro="citation1" /> 245 <expand macro="citation1" />
225 <citation type="doi">10.1093/bioinformatics/btw412</citation> 246 <citation type="bibtex">
226 <citation type="doi">10.1186/1471-2105-10-421</citation> 247 @article{Wall2008,
227 <citation type="doi">10.1093/molbev/msm088</citation> 248 journal = {Nucleic Acids Research},
228 <citation type="doi">10.18637/jss.v004.i02</citation> 249 author = {2. Wall PK, Leebens-Mack J, Muller KF, Field D, Altman NS},
250 title = {PlantTribes: a gene and gene family resource for comparative genomics in plants},
251 year = {2008},
252 volume = {36},
253 number = {suppl 1},
254 pages = {D970-D976},}
255 </citation>
256 <citation type="bibtex">
257 @article{Altschul1990,
258 journal = {Journal of molecular biology},
259 author = {3. Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ},
260 title = {Basic local alignment search tool},
261 year = {1990},
262 volume = {215},
263 number = {3},
264 pages = {403-410},}
265 </citation>
266 <citation type="bibtex">
267 @article{Katoh2013,
268 journal = {Molecular biology and evolution},
269 author = {4. Katoh K, Standley DM},
270 title = {MAFFT multiple sequence alignment software version 7: improvements in performance and usability},
271 year = {2013},
272 volume = {30},
273 number = {4},
274 pages = {772-780},}
275 </citation>
276 <citation type="bibtex">
277 @article{Yang2007,
278 journal = {Molecular biology and evolution},
279 author = {5. Yang Z},
280 title = {PAML 4: phylogenetic analysis by maximum likelihood},
281 year = {2007},
282 volume = {24},
283 number = {8},
284 pages = {1586-1591},}
285 </citation>
286 <citation type="bibtex">
287 @article{McLachlan1999,
288 journal = {Journal of Statistical Software},
289 author = {6. McLachlan GJ, Peel D, Basford KE, Adams P},
290 title = {The EMMIX software for the fitting of mixtures of normal and t-components},
291 year = {1999},
292 volume = {4},
293 number = {2},
294 pages = {1-14},}
295 </citation>
229 </citations> 296 </citations>
230 </tool> 297 </tool>