Mercurial > repos > bgruening > numeric_clustering
comparison numeric_clustering.xml @ 0:42a2825313e9 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/numeric_clustering commit ac9beb7e9c83b0ae811b304eb3085a4b0930f5a0
author | bgruening |
---|---|
date | Fri, 01 Jan 2016 10:24:53 -0500 |
parents | |
children | d645cdee08ed |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:42a2825313e9 |
---|---|
1 <tool id="numeric_clustering" name="Numeric Clustering" version="@VERSION@"> | |
2 <description></description> | |
3 <requirements> <!-- Package dependency declared by this tool --> | |
4 <requirement version="2.3.0" type="package">anaconda</requirement> | |
5 </requirements> | |
6 <stdio> <!-- Exit codes in the range "1:" are reported at level "fatal" --> | |
7 <exit_code range="1:" level="fatal" /> | |
8 </stdio> | |
9 <macros> <!-- Version token and parameter snippets that the algorithm sections pull in through expand --> | |
10 <token name="@VERSION@">0.9</token> | |
11 <macro name="n_clusters"> | |
12 <param name="n_clusters" label="Number of clusters" type="integer" value="8" optional="true" | |
13 help="default value is 8 (--n_clusters)" /> | |
14 </macro> | |
15 <macro name="n_init"> | |
16 <param name="n_init" label="Number of runs with different centroid seeds" type="integer" value="" optional="true" /> | |
17 </macro> | |
18 <macro name="max_iter"> | |
19 <param name="max_iter" label="Maximum number of iterations per single run" type="integer" value="" optional="true" /> | |
20 </macro> | |
21 <macro name="random_state"> | |
22 <param name="random_state" label="Initialize centers" type="integer" value="" optional="true" /> | |
23 </macro> | |
24 <macro name="affinity"> | |
25 <param name="affinity" label="Affinity" type="text" value="" optional="true" /> | |
26 </macro> | |
27 <macro name="tol"> | |
28 <param name="tol" label="Relative tolerance" type="float" value="" optional="true" /> | |
29 </macro> | |
30 <macro name="init"> | |
31 <param name="init" label="Select initialization method" type="select"> | |
32 <option value="k-means++">k-means++</option> | |
33 <option value="random">random</option> | |
34 </param> | |
35 </macro> | |
36 </macros> | |
37 <version_command>echo "@VERSION@"</version_command> | |
38 <command><![CDATA[ | |
39 #import json | |
40 ## Build a dict of the chosen algorithm's options: 'true'/'false' become booleans, numeric text becomes int or float, anything else stays a string. Empty values and keys starting with '__' are skipped. | |
41 #set $params = dict() | |
42 #for $key, $value in $algorithm_options.items(): | |
43 #set $option_text = str($value).strip() | |
44 #if not $key.startswith('__') and $key.strip() != 'selected_algorithm' and $option_text: | |
45 #if $option_text == 'false': | |
46 #set $value = False | |
47 #elif $option_text == 'true': | |
48 #set $value = True | |
49 #else: | |
50 #try: | |
51 #set $value = int($option_text) | |
52 #except: | |
53 #try: | |
54 #set $value = float($option_text) | |
55 #except: | |
56 #set $value = str($value) | |
57 #end try | |
58 #end try | |
59 #end if | |
60 #silent $params.update({str($key): $value}) | |
61 #end if | |
62 #end for | |
63 #set $json_string = json.dumps( $params ) | |
64 python "$cluster_script" '$json_string' | |
65 ]]> | |
66 </command> | |
67 <configfiles> <!-- cluster_script: reads "$infile" as a tab-separated table with the first column as index, instantiates the sklearn.cluster class named by selected_algorithm, applies the JSON parameters given as the first command-line argument, runs fit_predict on either the column slice (when end_column is greater than start_column) or on all values, then writes the table with an added 'cluster_label' column to "$outfile". NOTE(review): because the first file column becomes the index and is absent from data.values, the slice start_column-1:end_column appears to be shifted by one relative to file column numbers; confirm against the expected test output before changing it. --> | |
68 <configfile name="cluster_script"> | |
69 <![CDATA[#!/usr/bin/env python | |
70 import sys | |
71 import json | |
72 import numpy as np | |
73 import sklearn.cluster | |
74 import pandas | |
75 | |
76 data = pandas.DataFrame.from_csv("$infile", sep='\t', header=0, index_col=0, parse_dates=True, encoding=None, tupleize_cols=False ) | |
77 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm") | |
78 cluster_object = my_class() | |
79 | |
80 params = json.loads( sys.argv[1] ) | |
81 cluster_object.set_params(**params) | |
82 if $end_column > $start_column: | |
83 data_matrix = data.values[:, $start_column-1:$end_column] | |
84 else: | |
85 data_matrix = data.values | |
86 prediction = cluster_object.fit_predict( data_matrix ) | |
87 data['cluster_label'] = prediction | |
88 data.to_csv(path_or_buf = "$outfile",sep="\t") | |
89 ]]> | |
90 </configfile> | |
91 </configfiles> | |
92 <inputs> <!-- Input table, optional column range, and per-algorithm options. Boolean options use truevalue="true" / falsevalue="false" because the command section only converts those two strings into real booleans. --> | |
93 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> | |
94 <param name="start_column" label="Clustering column from" type="data_column" data_ref="infile" optional="True" /> | |
95 <param name="end_column" label="to" type="data_column" data_ref="infile" optional="True" /> | |
96 <conditional name="algorithm_options"> <!-- every non-empty option of the selected section is forwarded to the clustering script --> | |
97 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> | |
98 <option value="KMeans">KMeans</option> | |
99 <option value="DBSCAN">DBSCAN</option> | |
100 <option value="Birch">Birch</option> | |
101 <option value="MeanShift">MeanShift</option> | |
102 <option value="AffinityPropagation">Affinity Propagation</option> | |
103 <option value="AgglomerativeClustering">Agglomerative Clustering</option> | |
104 <option value="SpectralClustering">Spectral Clustering</option> | |
105 <option value="MiniBatchKMeans">Mini Batch KMeans</option> | |
106 </param> | |
107 <when value="KMeans"> | |
108 <expand macro="n_clusters"/> | |
109 <expand macro="init"/> | |
110 <expand macro="n_init"/> | |
111 <expand macro="max_iter"/> | |
112 <expand macro="tol"/> | |
113 <param name="precompute_distances" type="text" optional="true" value="" label="Precompute distances"/> | |
114 <expand macro="random_state"/> | |
115 <param name="copy_x" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Do not modify original data"/> | |
116 </when> | |
117 <when value="DBSCAN"> | |
118 <param name="eps" type="float" optional="true" value="0.5" label="Maximum neighborhood distance"/> | |
119 <param name="min_samples" type="integer" optional="true" value="5" label="Core point minimum population"/> | |
120 <param name="metric" type="text" optional="true" value="euclidean" label="Metric"/> | |
121 <param name="algorithm" type="select" optional="true" value="auto" label="Pointwise distance algorithm"> | |
122 <option value="auto">auto</option> | |
123 <option value="ball_tree">ball_tree</option> | |
124 <option value="kd_tree">kd_tree</option> | |
125 <option value="brute">brute</option> | |
126 </param> | |
127 <param name="leaf_size" type="integer" optional="true" value="30" label="Leaf size"/> | |
128 </when> | |
129 <when value="Birch"> | |
130 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/> | |
131 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/> | |
132 <expand macro="n_clusters"/> <!-- default to 3--> | |
133 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Compute labels for each fit"/--> | |
134 </when> | |
135 <when value="AffinityPropagation"> | |
136 <param name="damping" type="float" optional="true" value="0.5" label="Damping factor"/> | |
137 <expand macro="max_iter"/> <!--default to 200 --> | |
138 <param name="convergence_iter" type="integer" optional="true" value="15" label="Number of iterations at each convergence step"/> | |
139 <param name="copy" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Make a copy of input data"/> | |
140 <!--param name="preference" type="text" optional="true" value="None" label="Array like shape (n_samples,)"/--> | |
141 <expand macro="affinity"/> <!--default = euclidean--> | |
142 </when> | |
143 <when value="MeanShift"> | |
144 <param name="bandwidth" type="float" optional="true" value="" label="RBF kernel bandwidth"/> | |
145 <!--param name="seeds" type="list" optional="true" value="None" label=""/--> | |
146 <param name="bin_seeding" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Discretize initial kernel locations"/> | |
147 <param name="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin"/> | |
148 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Cluster all"/> | |
149 </when> | |
150 <when value="AgglomerativeClustering"> | |
151 <expand macro="n_clusters"/> <!-- default 2--> | |
152 <expand macro="affinity"/> <!--default = euclidean--> | |
153 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/--> | |
154 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/--> | |
155 <param name="n_components" type="integer" optional="true" value="" label="Number of connected components"/> | |
156 <!--param name="compute_full_tree" type="text or boolean" optional="true" value="auto" label=""/--> | |
157 <param name="linkage" type="select" optional="true" value="ward" label="Linkage"> | |
158 <option value="ward">ward</option> | |
159 <option value="complete">complete</option> | |
160 <option value="average">average</option> | |
161 </param> | |
162 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/--> | |
163 </when> | |
164 <when value="SpectralClustering"> | |
165 <expand macro="n_clusters"/> | |
166 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy"> | |
167 <option value="arpack">arpack</option> | |
168 <option value="lobpcg">lobpcg</option> | |
169 <option value="amg">amg</option> | |
170 </param> | |
171 <expand macro="random_state"/> | |
172 <!-- Todo: extend random_state type to int seed, RandomState instance, or None. --> | |
173 <expand macro="n_init"/> <!-- default to 10--> | |
174 <param name="gamma" type="float" optional="true" value="1.0" label="Kernel scaling factor"/> | |
175 <expand macro="affinity"/> <!--default =rbf--> | |
176 <param name="n_neighbors" type="integer" optional="true" value="10" label="Number of neighbors"/> | |
177 <!--param name="eigen_tol" type="float" optional="true" value="0.0" label="arpack eigendecomposition stopping threshold"/--> | |
178 <param name="assign_labels" type="select" optional="true" value="kmeans" label="Assign labels"> | |
179 <option value="kmeans">kmeans</option> | |
180 <option value="discretize">discretize</option> | |
181 </param> | |
182 <param name="degree" type="integer" optional="true" value="3" label="Degree of the polynomial (polynomial kernel only)"/> | |
183 <param name="coef0" type="integer" optional="true" value="1" label="Zero coefficient (polynomial and sigmoid kernels only)"/> | |
184 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/--> | |
185 </when> | |
186 <when value="MiniBatchKMeans"> | |
187 <expand macro="n_clusters"/> | |
188 <expand macro="init"/> | |
189 <expand macro="n_init"/> <!-- default to 3--> | |
190 <expand macro="max_iter"/> <!--default to 100--> | |
191 <expand macro="tol"/> <!--default = 0.0--> | |
192 <expand macro="random_state"/> | |
193 <param name="batch_size" type="integer" optional="true" value="100" label="Mini batch size"/> | |
194 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Compute labels for all data"/--> | |
195 <param name="max_no_improvement" type="integer" optional="true" value="10" label="Maximum number of improvement attempts"/> | |
196 <param name="init_size" type="integer" optional="true" value="" label="Number of random init samples"/> | |
197 <param name="reassignment_ratio" type="float" optional="true" value="0.01" label="Re-assignment ratio"/> | |
198 </when> | |
199 </conditional> | |
200 </inputs> | |
201 <outputs> <!-- Single output dataset; its format is taken from the input dataset --> | |
202 <data name="outfile" format_source="infile" /> | |
203 </outputs> | |
204 <tests> <!-- Two cases per algorithm plus one KMeans case restricted to a column range; boolean values are written in lowercase throughout --> | |
205 <test> | |
206 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
207 <param name="selected_algorithm" value="KMeans"/> | |
208 <param name="n_clusters" value="4" /> | |
209 <param name="init" value="k-means++" /> | |
210 <param name="random_state" value="100"/> | |
211 <output name="outfile" file="cluster_result01.txt"/> | |
212 </test> | |
213 <test> | |
214 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
215 <param name="selected_algorithm" value="KMeans"/> | |
216 <param name="n_clusters" value="6" /> | |
217 <param name="init" value="random" /> | |
218 <param name="random_state" value="100"/> | |
219 <output name="outfile" file="cluster_result02.txt"/> | |
220 </test> | |
221 <test> | |
222 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
223 <param name="selected_algorithm" value="DBSCAN"/> | |
224 <param name="algorithm" value="kd_tree"/> | |
225 <param name="leaf_size" value="10"/> | |
226 <param name="eps" value="1.0"/> | |
227 <output name="outfile" file="cluster_result03.txt"/> | |
228 </test> | |
229 <test> | |
230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
231 <param name="selected_algorithm" value="Birch"/> | |
232 <param name="n_clusters" value="5"/> | |
233 <param name="threshold" value="0.008"/> | |
234 <output name="outfile" file="cluster_result04.txt"/> | |
235 </test> | |
236 <test> | |
237 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
238 <param name="selected_algorithm" value="Birch"/> | |
239 <param name="branching_factor" value="20"/> | |
240 <output name="outfile" file="cluster_result05.txt"/> | |
241 </test> | |
242 <test> | |
243 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
244 <param name="selected_algorithm" value="AffinityPropagation"/> | |
245 <param name="affinity" value="euclidean"/> | |
246 <param name="copy" value="false"/> | |
247 <output name="outfile" file="cluster_result06.txt"/> | |
248 </test> | |
249 <test> | |
250 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
251 <param name="selected_algorithm" value="AffinityPropagation"/> | |
252 <param name="damping" value="0.8"/> | |
253 <output name="outfile" file="cluster_result07.txt"/> | |
254 </test> | |
255 <test> | |
256 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
257 <param name="selected_algorithm" value="MeanShift"/> | |
258 <param name="min_bin_freq" value="3"/> | |
259 <output name="outfile" file="cluster_result08.txt"/> | |
260 </test> | |
261 <test> | |
262 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
263 <param name="selected_algorithm" value="MeanShift"/> | |
264 <param name="cluster_all" value="false"/> | |
265 <output name="outfile" file="cluster_result09.txt"/> | |
266 </test> | |
267 <test> | |
268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
269 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
270 <param name="affinity" value="euclidean"/> | |
271 <param name="linkage" value="average"/> | |
272 <output name="outfile" file="cluster_result10.txt"/> | |
273 </test> | |
274 <test> | |
275 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
276 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
277 <param name="linkage" value="complete"/> | |
278 <param name="n_clusters" value="5"/> | |
279 <output name="outfile" file="cluster_result11.txt"/> | |
280 </test> | |
281 <test> | |
282 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
283 <param name="selected_algorithm" value="SpectralClustering"/> | |
284 <param name="eigen_solver" value="arpack"/> | |
285 <param name="n_neighbors" value="12"/> | |
286 <param name="n_clusters" value="7"/> | |
287 <param name="assign_labels" value="discretize"/> | |
288 <param name="random_state" value="100"/> | |
289 <output name="outfile" file="cluster_result12.txt"/> | |
290 </test> | |
291 <test> | |
292 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
293 <param name="selected_algorithm" value="SpectralClustering"/> | |
294 <param name="assign_labels" value="discretize"/> | |
295 <param name="random_state" value="100"/> | |
296 <param name="degree" value="2"/> | |
297 <output name="outfile" file="cluster_result13.txt"/> | |
298 </test> | |
299 <test> | |
300 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
301 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
302 <param name="tol" value="0.5"/> | |
303 <param name="random_state" value="100"/> | |
304 <output name="outfile" file="cluster_result14.txt"/> | |
305 </test> | |
306 <test> | |
307 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
308 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
309 <param name="n_init" value="5"/> | |
310 <param name="batch_size" value="10"/> | |
311 <param name="n_clusters" value="3"/> | |
312 <param name="random_state" value="100"/> | |
313 <param name="reassignment_ratio" value="1.0"/> | |
314 <output name="outfile" file="cluster_result15.txt"/> | |
315 </test> | |
316 <test> | |
317 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
318 <param name="selected_algorithm" value="KMeans"/> | |
319 <param name="start_column" value="3" /> | |
320 <param name="end_column" value="4" /> | |
321 <param name="n_clusters" value="6" /> | |
322 <param name="init" value="random" /> | |
323 <param name="random_state" value="100"/> | |
324 <output name="outfile" file="cluster_result16.txt"/> | |
325 </test> | |
326 </tests> | |
327 <help><![CDATA[ | |
328 **What it does** | |
329 | |
330 This clustering tool offers different clustering algorithms which are provided by | |
331 scikit-learn to find similarities among samples and cluster the samples based on these similarities. | |
332 | |
333 ]]></help> | |
334 <citations> <!-- BibTeX entry for scikit-learn; every field except the last is terminated by a comma --> | |
335 <citation type="bibtex"> | |
336 @article{scikit-learn, | |
337 title={Scikit-learn: Machine Learning in {P}ython}, | |
338 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. | |
339 and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. | |
340 and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and | |
341 Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, | |
342 journal={Journal of Machine Learning Research}, | |
343 volume={12}, | |
344 pages={2825--2830}, | |
345 year={2011}, | |
346 url = {https://github.com/scikit-learn/scikit-learn} | |
347 } | |
348 </citation> | |
349 </citations> | |
350 </tool> |