comparison macros.xml @ 2:7de8436f7f97 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 5e6e589002d554be180e575080e9ad66cc78ed74"
author iuc
date Sat, 12 Feb 2022 17:38:03 +0000
parents a5054beafded
children 50de1050e717
comparison
equal deleted inserted replaced
1:a5054beafded 2:7de8436f7f97
1 <?xml version="1.0"?>
1 <macros> 2 <macros>
2 <token name="@VERSION@">2.5.4</token> 3 <token name="@TOOL_VERSION@">3.2.2</token>
4 <token name="@VERSION_SUFFIX@">0</token>
5 <token name="@PROFILE@">20.01</token>
6 <xml name="biotools">
7 <xrefs>
8 <xref type="bio.tools">drep</xref>
9 </xrefs>
10 </xml>
3 <xml name="requirements"> 11 <xml name="requirements">
4 <requirements> 12 <requirements>
5 <requirement type="package" version="@VERSION@">drep</requirement> 13 <requirement type="package" version="@TOOL_VERSION@">drep</requirement>
6 <yield/> 14 <yield/>
7 </requirements> 15 </requirements>
8 </xml> 16 </xml>
9 <xml name="citations"> 17 <xml name="citations">
10 <citations> 18 <citations>
11 <citation type="doi">10.1038/ismej.2017.126</citation> 19 <citation type="doi">10.1038/ismej.2017.126</citation>
12 <yield /> 20 <yield />
13 </citations> 21 </citations>
14 </xml> 22 </xml>
15 <xml name="bio_tools">
16 <xrefs>
17 <xref type="bio.tools">drep</xref>
18 </xrefs>
19 </xml>
20 23
21 <xml name="genomes"> 24 <xml name="genomes">
22 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/> 25 <param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filter"/>
23 </xml> 26 </xml>
27
28 <!-- Append ".fasta" to the genome names so that purely numeric names are not read as integers.
29 Works around a bug in dRep that will probably be fixed in the next version -->
24 <token name="@PREPARE_GENOMES@"><![CDATA[ 30 <token name="@PREPARE_GENOMES@"><![CDATA[
25 #import re 31 #import re
26 #set $genomefiles = [] 32 #set $genomefiles = []
27 #for $genome in $genomes 33 #for $genome in $genomes
28 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) 34 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
29 ln -s '${genome}' '${input_name}' && 35 ln -s '${genome}' '${input_name}.fasta' &&
30 $genomefiles.append($input_name) 36 $genomefiles.append($input_name)
31 #end for 37 #end for
32 ]]></token> 38 ]]></token>
33 <token name="@GENOMES@"><![CDATA[ 39 <token name="@GENOMES@"><![CDATA[
34 -g 40 -g
35 #for $genomefile in $genomefiles 41 #for $genomefile in $genomefiles
36 '${genomefile}' 42 '${genomefile}.fasta'
37 #end for 43 #end for
38 ]]></token> 44 ]]></token>
39
40
41 <xml name="checkm_method">
42 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
43 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
44 <option value="lineage_wf">lineage_wf (more accurate)</option>
45 </param>
46 </xml>
47 <token name="@CHECKM_METHOD@"><![CDATA[
48 #if $checkM_method:
49 --checkM_method $checkM_method
50 #end if
51 ]]></token>
52 45
53 <xml name="filtering_options"> 46 <xml name="filtering_options">
54 <conditional name="filter"> 47 <section name="filter" title="Genome filtering" expanded="true">
55 <param name="set_options" type="select" label="set filtering options"> 48 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
56 <option value="yes">Yes</option> 49 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
57 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option> 50 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
51 </section>
52 </xml>
53 <xml name="test_default_filtering_options">
54 <section name="filter">
55 <param name="length" value="50000"/>
56 <param name="completeness" value="75"/>
57 <param name="contamination" value="100"/>
58 </section>
59 </xml>
60 <token name="@FILTER_OPTIONS@"><![CDATA[
61 --length $filter.length
62 --completeness $filter.completeness
63 --contamination $filter.contamination
64 ]]></token>
65
66 <xml name="quality_assessment_options">
67 <conditional name="quality">
68 <param name="source" type="select" label="Genome quality filtering" help="Skipping checkM and quality filtering is not recommended, except for bacteriophages, eukaryotes or other genomes where checkM scoring does not work. Genomes will then be chosen based only on length and N50.">
69 <option value="checkm" selected="true">Run checkM</option>
70 <option value="genomeInfo">Provide quality information on the genome (CSV file)</option>
71 <option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option>
58 </param> 72 </param>
59 <when value="yes"> 73 <when value="checkm">
60 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> 74 <param argument="--checkM_method" type="select" label="CheckM method">
61 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> 75 <option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option>
62 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> 76 <option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option>
63 77 </param>
64 <conditional name="quality"> 78 <param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/>
65 <param argument="source" type="select" label="genome quality"> 79 <param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/>
66 <help>
67 --ignoreGenomeQuality is useful with
68 bacteriophages or eukaryotes or things where checkM
69 scoring does not work. Will only choose genomes based
70 on length and N50.
71 </help>
72 <option value="checkm" selected="true">Run checkM</option>
73 <option value="genomeInfo">User supplied genomeInfo csv file</option>
74 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
75 </param>
76 <when value="checkm">
77 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
78 <help>
79 Using the checkm method of lineage_wf can require more than 40Gb of RAM.
80 </help>
81 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
82 <option value="lineage_wf">lineage_wf (more accurate)</option>
83 </param>
84 </when>
85 <when value="genomeInfo">
86 <param argument="--genomeInfo" type="data" format="csv" label="genomes fasta files">
87 <help><![CDATA[
88 A CSV dataset that must contain: [
89 "genome"(history dataset name of .fasta dataset of that genome),
90 "completeness"(0-100 value for completeness of the genome),
91 "contamination"(0-100 value of the contamination of the genome)]
92 ]]></help>
93 </param>
94 </when>
95 <when value="ignoreGenomeQuality"/>
96 </conditional>
97 </when> 80 </when>
98 <when value="no"/> 81 <when value="genomeInfo">
99 </conditional> 82 <param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes">
100 </xml> 83 <help><![CDATA[
101 <token name="@FILTER_OPTIONS@"><![CDATA[ 84 A CSV dataset that must contain: [
102 #if $filter.set_options == 'yes': 85 "genome"(history dataset name of .fasta dataset of that genome),
103 --length $filter.length 86 "completeness"(0-100 value for completeness of the genome),
104 --completeness $filter.completeness 87 "contamination"(0-100 value of the contamination of the genome)]
105 --contamination $filter.contamination 88 ]]></help>
106 #if $filter.quality.source == 'checkm'
107 --checkM_method $filter.quality.checkM_method
108 #elif $filter.quality.source == 'genomeInfo'
109 --genomeInfo $filter.quality.genomeInfo
110 #elif $filter.quality.source == 'ignoreGenomeQuality'
111 --ignoreGenomeQuality
112 #end if
113 #else
114 --checkM_method taxonomy_wf
115 #end if
116 ]]></token>
117
118 <xml name="genome_comparison_options">
119 <conditional name="genome_comparison">
120 <param name="set_options" type="select" label="set genome comparison options">
121 <option value="yes">Yes</option>
122 <option value="no" selected="true">No</option>
123 </param>
124 <when value="yes">
125 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
126 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comaprisons">
127 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
128 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option>
129 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option>
130 </param>
131 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
132 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option>
133 <option value="tight">tight = only align highly conserved regions</option>
134 </param> 89 </param>
135 </when> 90 </when>
136 <when value="no"/> 91 <when value="ignoreGenomeQuality"/>
137 </conditional> 92 </conditional>
138 </xml> 93 </xml>
139 <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[ 94 <xml name="test_default_quality_assessment_options">
140 #if $genome_comparison.set_options == 'yes': 95 <conditional name="quality">
141 --MASH_sketch $genome_comparison.MASH_sketch 96 <param name="source" value="checkm"/>
142 --S_algorithm $genome_comparison.S_algorithm 97 <param name="checkM_method" value="taxonomy_wf"/>
143 -n_PRESET $genome_comparison.n_PRESET 98 <param name="checkm_group_size" value="2000"/>
144 #end if 99 </conditional>
145 ]]></token> 100 </xml>
146 101 <token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[
147 <xml name="clustering_options"> 102 #if $quality.source == 'checkm'
103 --checkM_method '$quality.checkM_method'
104 #if str($quality.set_recursion) != ''
105 --set_recursion $quality.set_recursion
106 #end if
107 --checkm_group_size $quality.checkm_group_size
108 #else if $quality.source == 'genomeInfo'
109 --genomeInfo '$quality.genomeInfo'
110 #else if $quality.source == 'ignoreGenomeQuality'
111 --ignoreGenomeQuality
112 #end if
113 ]]></token>
114
115 <xml name="mash">
116 <param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/>
117 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/>
118 <param argument="--multiround_primary_clustering" type='boolean' checked="false" truevalue='--multiround_primary_clustering' falsevalue='' label="Cluster each primary chunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, at the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/>
119 <param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clusterings" help=" If you have more than this many genomes, process them in chunks of this size"/>
120 </xml>
121 <xml name="test_default_mash">
122 <param name="MASH_sketch" value="1000"/>
123 <param name="P_ani" value="0.9"/>
124 <param name="multiround_primary_clustering" value=''/>
125 <param name="primary_chunksize" value="5000"/>
126 </xml>
127 <token name="@MASH@"><![CDATA[
128 --MASH_sketch '$comp_clust.steps.MASH_sketch'
129 --P_ani $comp_clust.steps.P_ani
130 $comp_clust.steps.multiround_primary_clustering
131 --primary_chunksize $comp_clust.steps.primary_chunksize
132 ]]></token>
133
134 <xml name="nucmer">
135 <param argument="--n_PRESET" type="select" label="Presets to pass to nucmer">
136 <option value="normal" selected="true">normal: default ANIn parameters</option>
137 <option value="tight">tight: only align highly conserved regions</option>
138 </param>
139 </xml>
140 <xml name="test_default_nucmer">
141 <param name="n_PRESET" value="normal"/>
142 </xml>
143 <token name="@NUCMER@"><![CDATA[
144 --n_PRESET '$comp_clust.steps.clustering.n_PRESET'
145 ]]></token>
146
147 <xml name="coverage_method">
148 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
149 <option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
150 <option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option>
151 </param>
152 </xml>
153 <xml name="test_default_coverage_method">
154 <param name="coverage_method" value="larger"/>
155 </xml>
156 <token name="@COVERAGE_METHOD@"><![CDATA[
157 --coverage_method '$comp_clust.steps.clustering.coverage_method'
158 ]]></token>
159
160 <xml name="secondary_clustering">
148 <conditional name="clustering"> 161 <conditional name="clustering">
149 <param name="set_options" type="select" label="set clustering options"> 162 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
150 <option value="yes">Yes</option> 163 <option value="fastANI">fastANI: Kmer-based approach - very fast</option>
151 <option value="no" selected="true">No</option> 164 <option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option>
165 <option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option>
166 <option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option>
167 <option value="goANI">goANI: Open source version of gANI; requires nsimscan</option>
152 </param> 168 </param>
153 <when value="yes"> 169 <when value="fastANI">
154 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/> 170 <param argument="--greedy_secondary_clustering" type='boolean' checked="false" truevalue='--greedy_secondary_clustering' falsevalue='' label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/>
155 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> 171 </when>
156 172 <when value="ANImf">
157 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/> 173 <expand macro="nucmer"/>
158 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/> 174 <expand macro="coverage_method"/>
159 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minmum level of overlap between genomes when doing secondary comparisons"/> 175 </when>
160 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> 176 <when value="ANIn">
161 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> 177 <expand macro="nucmer"/>
162 <option value="larger" selected="true">arger = max((aligned length / genome 1), (aligned_length / genome2))</option> 178 <expand macro="coverage_method"/>
163 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option> 179 </when>
180 <when value="gANI"/>
181 <when value="goANI"/>
182 </conditional>
183 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
184 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
185 </xml>
186 <xml name="test_default_secondary_clustering">
187 <conditional name="clustering">
188 <param name="S_algorithm" value="ANImf"/>
189 <expand macro="test_default_nucmer"/>
190 <expand macro="test_default_coverage_method"/>
191 </conditional>
192 <param name="S_ani" value="0.99"/>
193 <param name="cov_thresh" value="0.1"/>
194 </xml>
195 <token name="@SECONDARY_CLUSTERING@"><![CDATA[
196 --S_algorithm '$comp_clust.steps.clustering.S_algorithm'
197 #if $comp_clust.steps.clustering.S_algorithm == 'fastANI'
198 $comp_clust.steps.clustering.greedy_secondary_clustering
199 #else if $comp_clust.steps.clustering.S_algorithm == 'ANImf'
200 @NUCMER@
201 @COVERAGE_METHOD@
202 #else if $comp_clust.steps.clustering.S_algorithm == 'ANIn'
203 @NUCMER@
204 @COVERAGE_METHOD@
205 #end if
206 --S_ani $comp_clust.steps.S_ani
207 --cov_thresh $comp_clust.steps.cov_thresh
208 ]]></token>
209
210 <xml name="comparison_clustering_options">
211 <section name="comp_clust" title="Genome comparison and clustering" expanded="false">
212 <conditional name="steps">
213 <param name="select" type="select" label="Steps in genome comparison">
214 <option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option>
215 <option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option>
216 <option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option>
164 </param> 217 </param>
165 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes"> 218 <when value="default">
166 <help>(passed to scipy.cluster.hierarchy.linkage)</help> 219 <expand macro="mash"/>
167 <option value="average" selected="true">average</option> 220 <expand macro="secondary_clustering"/>
168 </param> 221 </when>
169 </when> 222 <when value="SkipMash">
170 <when value="no"/> 223 <expand macro="secondary_clustering"/>
171 </conditional> 224 </when>
172 </xml> 225 <when value="SkipSecondary">
173 <token name="@CLUSTERING_OPTIONS@"><![CDATA[ 226 <expand macro="mash"/>
174 #if $clustering.set_options == 'yes': 227 </when>
175 --P_ani $clustering.P_ani 228 </conditional>
176 --S_ani $clustering.S_ani 229 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage">
177 $clustering.SkipMash 230 <option value="average" selected="true">average</option>
178 $clustering.SkipSecondary 231 <option value="ward">ward</option>
179 --cov_thresh $clustering.cov_thresh 232 <option value="single">single</option>
180 --coverage_method $clustering.coverage_method 233 <option value="median">median</option>
181 --clusterAlg $clustering.clusterAlg 234 <option value="centroid">centroid</option>
182 #end if 235 <option value="weighted">weighted</option>
236 </param>
237 <param argument="--run_tertiary_clustering" type='boolean' checked="false" truevalue='--run_tertiary_clustering' falsevalue='' label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/>
238 </section>
239 </xml>
240 <xml name="test_default_comparison_clustering_options">
241 <section name="comp_clust">
242 <conditional name="steps">
243 <param name="select" value="default" />
244 <expand macro="test_default_mash"/>
245 <expand macro="test_default_secondary_clustering"/>
246 </conditional>
247 <param name="clusterAlg" value="average"/>
248 <param name="run_tertiary_clustering" value=''/>
249 </section>
250 </xml>
251 <token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[
252 #if $comp_clust.steps.select == 'default'
253 @MASH@
254 @SECONDARY_CLUSTERING@
255 #else if $comp_clust.steps.select == 'SkipMash'
256 --SkipMash
257 @SECONDARY_CLUSTERING@
258 #else
259 @MASH@
260 --SkipSecondary
261 #end if
262 --clusterAlg '$comp_clust.clusterAlg'
263 $comp_clust.run_tertiary_clustering
183 ]]></token> 264 ]]></token>
184 265
185 <xml name="scoring_options"> 266 <xml name="scoring_options">
186 <conditional name="scoring"> 267 <section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = cent_weight">
187 <param name="set_options" type="select" label="set scoring options"> 268 <param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/>
188 <option value="yes">Yes</option> 269 <param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/>
189 <option value="no" selected="true">No</option> 270 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/>
190 </param> 271 <param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/>
191 <when value="yes"> 272 <param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/>
192 <param argument="--completeness_weight" type="float" value="1" label="completeness weight"> 273 <param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/>
193 <help> 274 </section>
194 Based off of the formula: 275 </xml>
195 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) 276 <xml name="test_default_scoring_options">
196 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; 277 <section name="scoring">
197 </help> 278 <param name="completeness_weight" value="1"/>
198 </param> 279 <param name="contamination_weight" value="5"/>
199 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/> 280 <param name="strain_heterogeneity_weight" value="1"/>
200 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/> 281 <param name="N50_weight" value=".5" />
201 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/> 282 <param name="size_weight" value="0"/>
202 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/> 283 <param name="centrality_weight" value="1"/>
203 </when> 284 </section>
204 <when value="no"/>
205 </conditional>
206 </xml> 285 </xml>
207 <token name="@SCORING_OPTIONS@"><![CDATA[ 286 <token name="@SCORING_OPTIONS@"><![CDATA[
208 #if $scoring.set_options == 'yes': 287 --completeness_weight $scoring.completeness_weight
209 --completeness_weight $scoring.completeness_weight 288 --contamination_weight $scoring.contamination_weight
210 --contamination_weight $scoring.contamination_weight 289 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
211 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight 290 --N50_weight $scoring.N50_weight
212 --N50_weight $scoring.N50_weight 291 --size_weight $scoring.size_weight
213 --size_weight $scoring.size_weight 292 --centrality_weight $scoring.centrality_weight
214 #end if 293 ]]></token>
215 ]]></token> 294
216 295 <xml name="warning_options">
217 <xml name="taxonomy_options"> 296 <section name="warning" title="Warnings" expanded="false">
218 <conditional name="taxonomy"> 297 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
219 <param name="set_options" type="select" label="generate taxonomy information"> 298 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
220 <option value="yes">Yes</option> 299 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
221 <option value="no" selected="true">No</option> 300 </section>
222 </param> 301 </xml>
223 <when value="yes"> 302 <xml name="test_default_warning_options">
224 <param argument="--tax_method" type="select" label="Method of determining taxonomy"> 303 <section name="warning">
225 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> 304 <param name="warn_dist" value="0.25"/>
226 <option value="percent" selected="true">percent = The most descriptive taxonimic level with at least (per) hits</option> 305 <param name="warn_sim" value="0.98"/>
227 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option> 306 <param name="warn_aln" value="0.25"/>
228 </param> 307 </section>
229 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
230 <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
231 </when>
232 <when value="no"/>
233 </conditional>
234 </xml>
235 <token name="@TAXONOMY_OPTIONS@"><![CDATA[
236 #if $taxonomy.set_options == 'yes':
237 --run_tax
238 --tax_method $taxonomy.tax_method
239 --percent $taxonomy.percent
240 --cent_index $taxonomy.cent_index
241 #end if
242 ]]></token>
243
244 <xml name="warning_options">
245 <conditional name="warning">
246 <param name="set_options" type="select" label="set warning options">
247 <option value="yes">Yes</option>
248 <option value="no" selected="true">No</option>
249 </param>
250 <when value="yes">
251 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
252 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
253 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
254 </when>
255 <when value="no"/>
256 </conditional>
257 </xml> 308 </xml>
258 <token name="@WARNING_OPTIONS@"><![CDATA[ 309 <token name="@WARNING_OPTIONS@"><![CDATA[
259 #if $warning.set_options == 'yes': 310 --warn_dist $warning.warn_dist
260 --warn_dist $warning.warn_dist 311 --warn_sim $warning.warn_sim
261 --warn_sim $warning.warn_sim 312 --warn_aln $warning.warn_aln
262 --warn_aln $warning.warn_aln
263 #end if
264 ]]></token> 313 ]]></token>
265 314
266 <xml name="select_outputs"> 315 <xml name="select_outputs">
267 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> 316 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
268 <option value="log" selected="true">log</option> 317 <option value="log" selected="true">log</option>
280 <option value="Winning_genomes">Winning_genomes.pdf</option> 329 <option value="Winning_genomes">Winning_genomes.pdf</option>
281 <option value="Widb">Widb.csv</option> 330 <option value="Widb">Widb.csv</option>
282 <option value="Chdb">Chdb.tsv</option> 331 <option value="Chdb">Chdb.tsv</option>
283 </expand> 332 </expand>
284 </xml> 333 </xml>
285 334 <xml name="test_default_select_drep_outputs">
286 <xml name="common_outputs"> 335 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb" />
336 </xml>
337 <xml name="test_default_select_outputs">
338 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots" />
339 </xml>
340
341 <xml name="common_outputs">
287 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> 342 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log">
288 <filter>'log' in select_outputs or not select_outputs</filter> 343 <filter>'log' in select_outputs or not select_outputs</filter>
289 </data> 344 </data>
290 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> 345 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt">
291 <filter>'warnings' in select_outputs</filter> 346 <filter>'warnings' in select_outputs</filter>
301 </data> 356 </data>
302 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> 357 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf">
303 <filter>'Clustering_scatterplots' in select_outputs</filter> 358 <filter>'Clustering_scatterplots' in select_outputs</filter>
304 </data> 359 </data>
305 </xml> 360 </xml>
306
307
308 <xml name="drep_outputs"> 361 <xml name="drep_outputs">
309 <expand macro="common_outputs"/> 362 <expand macro="common_outputs"/>
310 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> 363 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf">
311 <filter>'Cluster_scoring' in select_outputs</filter> 364 <filter>'Cluster_scoring' in select_outputs</filter>
312 </data> 365 </data>
318 </data> 371 </data>
319 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> 372 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv">
320 <filter>'Chdb' in select_outputs</filter> 373 <filter>'Chdb' in select_outputs</filter>
321 </data> 374 </data>
322 </xml> 375 </xml>
323 376 <xml name="test_string_inputs">
324 377 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
325 <xml name="test_defaults_log"> 378 </xml>
326 <test> 379 <xml name="test_integer_inputs">
327 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> 380 <param name="genomes" ftype="fasta" value="001,002,003"/>
328 <output name="log"> 381 </xml>
329 <assert_contents> 382 <xml name="test_log_output">
330 <yield/> 383 <output name="log">
331 </assert_contents> 384 <assert_contents>
332 </output> 385 <yield/>
333 </test> 386 </assert_contents>
334 </xml> 387 </output>
335 388 </xml>
336 <token name="@GENOMES_HELP@"><![CDATA[ 389 <token name="@GENOMES_HELP@"><![CDATA[
337 I/O PARAMETERS: 390 I/O PARAMETERS:
338 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] 391 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
339 genomes to cluster in .fasta format 392 genomes to cluster in .fasta format
340 (default: None) 393 (default: None)
341 394
342 395
343 ]]></token> 396 ]]></token>
344
345 <token name="@FILTERING_HELP@"><![CDATA[ 397 <token name="@FILTERING_HELP@"><![CDATA[
346 FILTERING OPTIONS: 398 FILTERING OPTIONS:
347 -l LENGTH, --length LENGTH 399 -l LENGTH, --length LENGTH
348 Minimum genome length 400 Minimum genome length
349 (default: 50000) 401 (default: 50000)
366 scoring does not work. Will only choose genomes based 418 scoring does not work. Will only choose genomes based
367 on length and N50 (default: False) 419 on length and N50 (default: False)
368 420
369 421
370 ]]></token> 422 ]]></token>
371
372 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ 423 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
373 GENOME COMPARISON PARAMETERS: 424 GENOME COMPARISON PARAMETERS:
374 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH 425 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
375 MASH sketch size (default: 1000) 426 MASH sketch size (default: 1000)
376 427
385 Presets to pass to nucmer 436 Presets to pass to nucmer
386 tight = only align highly conserved regions 437 tight = only align highly conserved regions
387 normal = default ANIn parameters (default: normal) 438 normal = default ANIn parameters (default: normal)
388 439
389 ]]></token> 440 ]]></token>
390
391 <token name="@CLUSTERING_HELP@"><![CDATA[ 441 <token name="@CLUSTERING_HELP@"><![CDATA[
392 CLUSTERING PARAMETERS: 442 CLUSTERING PARAMETERS:
393 -pa P_ANI, --P_ani P_ANI 443 -pa P_ANI, --P_ani P_ANI
394 ANI threshold to form primary (MASH) clusters 444 ANI threshold to form primary (MASH) clusters
395 (default: 0.9) 445 (default: 0.9)
415 --clusterAlg CLUSTERALG 465 --clusterAlg CLUSTERALG
416 Algorithm used to cluster genomes (passed to 466 Algorithm used to cluster genomes (passed to
417 scipy.cluster.hierarchy.linkage (default: average) 467 scipy.cluster.hierarchy.linkage (default: average)
418 468
419 ]]></token> 469 ]]></token>
420
421 <token name="@SCORING_HELP@"><![CDATA[ 470 <token name="@SCORING_HELP@"><![CDATA[
422 SCORING CRITERIA 471 SCORING CRITERIA
423 Based off of the formula: 472 Based off of the formula:
424 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) 473 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
425 474
426 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: 475 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
427 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT 476 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
428 completeness weight (default: 1) 477 completeness weight (default: 1)
435 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT 484 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
436 weight of log(genome size) (default: 0) 485 weight of log(genome size) (default: 0)
437 486
438 487
439 ]]></token> 488 ]]></token>
440
441 <token name="@TAXONOMY_HELP@"><![CDATA[ 489 <token name="@TAXONOMY_HELP@"><![CDATA[
442 TAXONOMY: 490 TAXONOMY:
443 --run_tax generate taxonomy information (Tdb) 491 --run_tax generate taxonomy information (Tdb)
444 (default: False) 492 (default: False)
445 493
459 path to centrifuge index (for example, 507 path to centrifuge index (for example,
460 /home/mattolm/download/centrifuge/indices/b+h+v 508 /home/mattolm/download/centrifuge/indices/b+h+v
461 (default: None) 509 (default: None)
462 510
463 ]]></token> 511 ]]></token>
464
465 <token name="@WARNINGS_HELP@"><![CDATA[ 512 <token name="@WARNINGS_HELP@"><![CDATA[
466 WARNINGS: 513 WARNINGS:
467 --warn_dist WARN_DIST 514 --warn_dist WARN_DIST
468 How far from the threshold to throw cluster warnings 515 How far from the threshold to throw cluster warnings
469 (default: 0.25) 516 (default: 0.25)
471 genomes (default: 0.98) 518 genomes (default: 0.98)
472 --warn_aln WARN_ALN Minimum aligned fraction for warnings between 519 --warn_aln WARN_ALN Minimum aligned fraction for warnings between
473 dereplicated genomes (ANIn) (default: 0.25) 520 dereplicated genomes (ANIn) (default: 0.25)
474 521
475 ]]></token> 522 ]]></token>
476
477
478 </macros> 523 </macros>