comparison numeric_clustering.xml @ 2:1d465f2ebfad draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/numeric_clustering commit adf077b912ddebd97b07b947b855cdd2862ed8ef
author bgruening
date Fri, 01 Jan 2016 12:58:17 -0500
parents d645cdee08ed
children 6bfbaf81b8f4
comparison
equal deleted inserted replaced
1:d645cdee08ed 2:1d465f2ebfad
6 <stdio> 6 <stdio>
7 <exit_code level="fatal" range="1:"/> 7 <exit_code level="fatal" range="1:"/>
8 </stdio> 8 </stdio>
9 <macros> 9 <macros>
10 <token name="@VERSION@">0.9</token> 10 <token name="@VERSION@">0.9</token>
11 <macro name="n_clusters"> 11 <macro name="n_clusters" token_default_value="8">
12 <param name="n_clusters" type="integer" optional="true" value="8" label="Number of clusters" 12 <param name="n_clusters" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of clusters"
13 help="default value is 8 (--n_clusters)"/> 13 help="default value is @DEFAULT_VALUE@ (--n_clusters)"/>
14 </macro> 14 </macro>
15 <macro name="n_init"> 15 <macro name="n_init">
16 <param name="n_init" type="integer" optional="true" value="" label="Number of runs with different centroid seeds"/> 16 <param name="n_init" type="integer" optional="true" value="" label="Number of runs with different centroid seeds"/>
17 </macro> 17 </macro>
18 <macro name="max_iter"> 18 <macro name="max_iter">
60 #end if 60 #end if
61 #end for 61 #end for
62 #set $json_string = json.dumps( $params ) 62 #set $json_string = json.dumps( $params )
63 63
64 python "$cluster_script" '$json_string' 64 python "$cluster_script" '$json_string'
65 &&
66 cat "$cluster_script" >&2
67
65 ]]> 68 ]]>
66 </command> 69 </command>
67 <configfiles> 70 <configfiles>
68 <configfile name="cluster_script"> 71 <configfile name="cluster_script">
69 <![CDATA[#!/usr/bin/env python 72 <![CDATA[
70 import sys 73 import sys
71 import json 74 import json
72 import numpy as np 75 import numpy as np
73 import sklearn.cluster 76 import sklearn.cluster
74 import pandas 77 import pandas
75 78
76 data = pandas.DataFrame.from_csv("$infile", sep='\t', header=0, index_col=0, parse_dates=True, encoding=None, tupleize_cols=False ) 79 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=0, parse_dates=True, encoding=None, tupleize_cols=False )
77 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm") 80 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm")
78 cluster_object = my_class() 81 cluster_object = my_class()
79 82
80 params = json.loads( sys.argv[1] ) 83 params = json.loads( sys.argv[1] )
81 cluster_object.set_params(**params) 84 cluster_object.set_params(**params)
82 if $end_column >= $start_column: 85 #if $end_column and $start_column:
86
87 if $end_column >= $start_column:
83 data_matrix = data.values[:, $start_column-1:$end_column] 88 data_matrix = data.values[:, $start_column-1:$end_column]
84 else: 89 else:
85 data_matrix = data.values 90 data_matrix = data.values
91
92 #else:
93 data_matrix = data.values
94 #end if
86 prediction = cluster_object.fit_predict( data_matrix ) 95 prediction = cluster_object.fit_predict( data_matrix )
87 data['cluster_label'] = prediction 96 data[len(data.columns)] = prediction
88 data.to_csv(path_or_buf = "$outfile",sep="\t") 97 data.to_csv(path_or_buf = "$outfile", sep="\t")
89 ]]> 98 ]]>
90 </configfile> 99 </configfile>
91 </configfiles> 100 </configfiles>
92 <inputs> 101 <inputs>
93 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> 102 <param name="infile" type="data" format="tabular" label="Data file with numeric values" />
94 <param name="start_column" label="Clustering column from" type="data_column" data_ref="infile" optional="True" /> 103 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Clustering column from" />
95 <param name="end_column" label="to" type="data_column" data_ref="infile" optional="True" /> 104 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="to" />
96 <conditional name="algorithm_options"> 105 <conditional name="algorithm_options">
97 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> 106 <param name="selected_algorithm" type="select" label="Clustering Algorithm">
98 <option value="KMeans">KMeans</option> 107 <option value="KMeans">KMeans</option>
99 <option value="DBSCAN">DBSCAN</option> 108 <option value="DBSCAN">DBSCAN</option>
100 <option value="Birch">Birch</option> 109 <option value="Birch">Birch</option>
103 <option value="AgglomerativeClustering">Agglomerative Clustering</option> 112 <option value="AgglomerativeClustering">Agglomerative Clustering</option>
104 <option value="SpectralClustering">Spectral Clustering</option> 113 <option value="SpectralClustering">Spectral Clustering</option>
105 <option value="MiniBatchKMeans">Mini Batch KMeans</option> 114 <option value="MiniBatchKMeans">Mini Batch KMeans</option>
106 </param> 115 </param>
107 <when value="KMeans"> 116 <when value="KMeans">
108 <expand macro="n_clusters"/> 117 <expand macro="n_clusters" default_label="8"/>
109 <expand macro="init"/> 118 <expand macro="init"/>
110 <expand macro="n_init"/> 119 <expand macro="n_init"/>
111 <expand macro="max_iter"/> 120 <expand macro="max_iter"/>
112 <expand macro="tol"/> 121 <expand macro="tol"/>
113 <param name="precompute_distances" type="text" optional="true" value="" label="Precompute distances"/> 122 <param name="precompute_distances" type="text" optional="true" value="" label="Precompute distances"/>
127 <param name="leaf_size" type="integer" optional="true" value="30" label="Leaf size"/> 136 <param name="leaf_size" type="integer" optional="true" value="30" label="Leaf size"/>
128 </when> 137 </when>
129 <when value="Birch"> 138 <when value="Birch">
130 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/> 139 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/>
131 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/> 140 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/>
132 <expand macro="n_clusters"/> <!-- default to 3--> 141 <expand macro="n_clusters" default_label="3" /> <!-- default to 3-->
133 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevale="false" label="Compute labels for each fit"/--> 142 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevale="false" label="Compute labels for each fit"/-->
134 </when> 143 </when>
135 <when value="AffinityPropagation"> 144 <when value="AffinityPropagation">
136 <param name="damping" type="float" optional="true" value="0.5" label="Damping factor"/> 145 <param name="damping" type="float" optional="true" value="0.5" label="Damping factor"/>
137 <expand macro="max_iter"/> <!--default to 200 --> 146 <expand macro="max_iter"/> <!--default to 200 -->
146 <param name="bin_seeding" type="boolean" optional="true" truevalue="true" falsevale="false" label="Discretize initial kernel locations"/> 155 <param name="bin_seeding" type="boolean" optional="true" truevalue="true" falsevale="false" label="Discretize initial kernel locations"/>
147 <param name="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin"/> 156 <param name="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin"/>
148 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevale="false" label="Cluster all"/> 157 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevale="false" label="Cluster all"/>
149 </when> 158 </when>
150 <when value="AgglomerativeClustering"> 159 <when value="AgglomerativeClustering">
151 <expand macro="n_clusters"/> <!-- deafault 2--> 160 <expand macro="n_clusters" default_label="2" /> <!-- deafault 2-->
152 <expand macro="affinity"/> <!--default = euclidean--> 161 <expand macro="affinity"/> <!--default = euclidean-->
153 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/--> 162 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/-->
154 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/--> 163 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/-->
155 <param name="n_components" type="integer" optional="true" value="" label="Number of connected components"/> 164 <param name="n_components" type="integer" optional="true" value="" label="Number of connected components"/>
156 <!--param name="compute_full_tree" type="text or boolean" optional="true" value="auto" label=""/--> 165 <!--param name="compute_full_tree" type="text or boolean" optional="true" value="auto" label=""/-->
160 <option value="average">average</option> 169 <option value="average">average</option>
161 </param> 170 </param>
162 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/--> 171 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/-->
163 </when> 172 </when>
164 <when value="SpectralClustering"> 173 <when value="SpectralClustering">
165 <expand macro="n_clusters"/> 174 <expand macro="n_clusters" default_label="8" />
166 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy"> 175 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy">
167 <option value="arpack">arpack</option> 176 <option value="arpack">arpack</option>
168 <option value="lobpcg">lobpcg</option> 177 <option value="lobpcg">lobpcg</option>
169 <option value="amg">amg</option> 178 <option value="amg">amg</option>
170 </param> 179 </param>
182 <param name="degree" type="integer" optional="true" value="3" label="Degree of the polynomial (polynomial kernel only)"/> 191 <param name="degree" type="integer" optional="true" value="3" label="Degree of the polynomial (polynomial kernel only)"/>
183 <param name="coef0" type="integer" optional="true" value="1" label="Zero coefficient (polynomial and sigmoid kernels only)"/> 192 <param name="coef0" type="integer" optional="true" value="1" label="Zero coefficient (polynomial and sigmoid kernels only)"/>
184 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/--> 193 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/-->
185 </when> 194 </when>
186 <when value="MiniBatchKMeans"> 195 <when value="MiniBatchKMeans">
187 <expand macro="n_clusters"/> 196 <expand macro="n_clusters" default_label="8"/>
188 <expand macro="init"/> 197 <expand macro="init"/>
189 <expand macro="n_init"/> <!-- default to 3--> 198 <expand macro="n_init"/> <!-- default to 3-->
190 <expand macro="max_iter"/> <!--default to 100--> 199 <expand macro="max_iter"/> <!--default to 100-->
191 <expand macro="tol"/> <!--default = 0.0--> 200 <expand macro="tol"/> <!--default = 0.0-->
192 <expand macro="random_state"/> 201 <expand macro="random_state"/>
203 </outputs> 212 </outputs>
204 <tests> 213 <tests>
205 <test> 214 <test>
206 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 215 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
207 <param name="selected_algorithm" value="KMeans"/> 216 <param name="selected_algorithm" value="KMeans"/>
217 <param name="start_column" value="2" />
218 <param name="end_column" value="4" />
208 <param name="n_clusters" value="4" /> 219 <param name="n_clusters" value="4" />
209 <param name="init" value="k-means++" /> 220 <param name="init" value="k-means++" />
210 <param name="random_state" value="100"/> 221 <param name="random_state" value="100"/>
211 <output name="outfile" file="cluster_result01.txt"/> 222 <output name="outfile" file="cluster_result01.txt"/>
212 </test> 223 </test>
213 <test> 224 <test>
214 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 225 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
215 <param name="selected_algorithm" value="KMeans"/> 226 <param name="selected_algorithm" value="KMeans"/>
216 <param name="n_clusters" value="6" /> 227 <param name="start_column" value="2" />
228 <param name="end_column" value="4" />
229 <param name="n_clusters" value="4" />
217 <param name="init" value="random" /> 230 <param name="init" value="random" />
218 <param name="random_state" value="100"/> 231 <param name="random_state" value="100"/>
219 <output name="outfile" file="cluster_result02.txt"/> 232 <output name="outfile" file="cluster_result02.txt"/>
220 </test> 233 </test>
221 <test> 234 <test>
222 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 235 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
223 <param name="selected_algorithm" value="DBSCAN"/> 236 <param name="selected_algorithm" value="DBSCAN"/>
237 <param name="start_column" value="2" />
238 <param name="end_column" value="4" />
224 <param name="algorithm" value="kd_tree"/> 239 <param name="algorithm" value="kd_tree"/>
225 <param name="leaf_size" value="10"/> 240 <param name="leaf_size" value="10"/>
226 <param name="eps" value="1.0"/> 241 <param name="eps" value="1.0"/>
227 <output name="outfile" file="cluster_result03.txt"/> 242 <output name="outfile" file="cluster_result03.txt"/>
228 </test> 243 </test>
229 <test> 244 <test>
230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 245 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
231 <param name="selected_algorithm" value="Birch"/> 246 <param name="selected_algorithm" value="Birch"/>
232 <param name="n_clusters" value="5"/> 247 <param name="start_column" value="2" />
248 <param name="end_column" value="4" />
249 <param name="n_clusters" value="4"/>
233 <param name="threshold" value="0.008"/> 250 <param name="threshold" value="0.008"/>
234 <output name="outfile" file="cluster_result04.txt"/> 251 <output name="outfile" file="cluster_result04.txt"/>
235 </test> 252 </test>
236 <test> 253 <test>
237 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 254 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
238 <param name="selected_algorithm" value="Birch"/> 255 <param name="selected_algorithm" value="Birch"/>
256 <param name="start_column" value="2" />
257 <param name="end_column" value="4" />
239 <param name="branching_factor" value="20"/> 258 <param name="branching_factor" value="20"/>
240 <output name="outfile" file="cluster_result05.txt"/> 259 <output name="outfile" file="cluster_result05.txt"/>
241 </test> 260 </test>
242 <test> 261 <test>
243 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 262 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
244 <param name="selected_algorithm" value="AffinityPropagation"/> 263 <param name="selected_algorithm" value="AffinityPropagation"/>
264 <param name="start_column" value="2" />
265 <param name="end_column" value="4" />
245 <param name="affinity" value="euclidean"/> 266 <param name="affinity" value="euclidean"/>
246 <param name="copy" value="false"/> 267 <param name="copy" value="false"/>
247 <output name="outfile" file="cluster_result06.txt"/> 268 <output name="outfile" file="cluster_result06.txt"/>
248 </test> 269 </test>
249 <test> 270 <test>
250 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 271 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
251 <param name="selected_algorithm" value="AffinityPropagation"/> 272 <param name="selected_algorithm" value="AffinityPropagation"/>
273 <param name="start_column" value="2" />
274 <param name="end_column" value="4" />
252 <param name="damping" value="0.8"/> 275 <param name="damping" value="0.8"/>
253 <output name="outfile" file="cluster_result07.txt"/> 276 <output name="outfile" file="cluster_result07.txt"/>
254 </test> 277 </test>
255 <test> 278 <test>
256 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 279 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
257 <param name="selected_algorithm" value="MeanShift"/> 280 <param name="selected_algorithm" value="MeanShift"/>
281 <param name="start_column" value="2" />
282 <param name="end_column" value="4" />
258 <param name="min_bin_freq" value="3"/> 283 <param name="min_bin_freq" value="3"/>
259 <output name="outfile" file="cluster_result08.txt"/> 284 <output name="outfile" file="cluster_result08.txt"/>
260 </test> 285 </test>
261 <test> 286 <test>
262 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 287 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
263 <param name="selected_algorithm" value="MeanShift"/> 288 <param name="selected_algorithm" value="MeanShift"/>
289 <param name="start_column" value="2" />
290 <param name="end_column" value="4" />
264 <param name="cluster_all" value="False"/> 291 <param name="cluster_all" value="False"/>
265 <output name="outfile" file="cluster_result09.txt"/> 292 <output name="outfile" file="cluster_result09.txt"/>
266 </test> 293 </test>
267 <test> 294 <test>
268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 295 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
269 <param name="selected_algorithm" value="AgglomerativeClustering"/> 296 <param name="selected_algorithm" value="AgglomerativeClustering"/>
297 <param name="start_column" value="2" />
298 <param name="end_column" value="4" />
270 <param name="affinity" value="euclidean"/> 299 <param name="affinity" value="euclidean"/>
271 <param name="linkage" value="average"/> 300 <param name="linkage" value="average"/>
272 <output name="outfile" file="cluster_result10.txt"/> 301 <output name="outfile" file="cluster_result10.txt"/>
273 </test> 302 </test>
274 <test> 303 <test>
275 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 304 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
276 <param name="selected_algorithm" value="AgglomerativeClustering"/> 305 <param name="selected_algorithm" value="AgglomerativeClustering"/>
306 <param name="start_column" value="2" />
307 <param name="end_column" value="4" />
277 <param name="linkage" value="complete"/> 308 <param name="linkage" value="complete"/>
278 <param name="n_clusters" value="5"/> 309 <param name="n_clusters" value="4"/>
279 <output name="outfile" file="cluster_result11.txt"/> 310 <output name="outfile" file="cluster_result11.txt"/>
280 </test> 311 </test>
281 <test> 312 <test>
282 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 313 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
283 <param name="selected_algorithm" value="SpectralClustering"/> 314 <param name="selected_algorithm" value="SpectralClustering"/>
315 <param name="start_column" value="2" />
316 <param name="end_column" value="4" />
284 <param name="eigen_solver" value="arpack"/> 317 <param name="eigen_solver" value="arpack"/>
285 <param name="n_neighbors" value="12"/> 318 <param name="n_neighbors" value="12"/>
286 <param name="n_clusters" value="7"/> 319 <param name="n_clusters" value="4"/>
287 <param name="assign_labels" value="discretize"/> 320 <param name="assign_labels" value="discretize"/>
288 <param name="random_state" value="100"/> 321 <param name="random_state" value="100"/>
289 <output name="outfile" file="cluster_result12.txt"/> 322 <output name="outfile" file="cluster_result12.txt"/>
290 </test> 323 </test>
291 <test> 324 <test>
292 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
293 <param name="selected_algorithm" value="SpectralClustering"/> 326 <param name="selected_algorithm" value="SpectralClustering"/>
327 <param name="start_column" value="2" />
328 <param name="end_column" value="4" />
294 <param name="assign_labels" value="discretize"/> 329 <param name="assign_labels" value="discretize"/>
295 <param name="random_state" value="100"/> 330 <param name="random_state" value="100"/>
296 <param name="degree" value="2"/> 331 <param name="degree" value="2"/>
297 <output name="outfile" file="cluster_result13.txt"/> 332 <output name="outfile" file="cluster_result13.txt"/>
298 </test> 333 </test>
299 <test> 334 <test>
300 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 335 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
301 <param name="selected_algorithm" value="MiniBatchKMeans"/> 336 <param name="selected_algorithm" value="MiniBatchKMeans"/>
337 <param name="start_column" value="2" />
338 <param name="end_column" value="4" />
302 <param name="tol" value="0.5"/> 339 <param name="tol" value="0.5"/>
303 <param name="random_state" value="100"/> 340 <param name="random_state" value="100"/>
304 <output name="outfile" file="cluster_result14.txt"/> 341 <output name="outfile" file="cluster_result14.txt"/>
305 </test> 342 </test>
306 <test> 343 <test>
307 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 344 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
308 <param name="selected_algorithm" value="MiniBatchKMeans"/> 345 <param name="selected_algorithm" value="MiniBatchKMeans"/>
309 <param name="n_init" value="5"/> 346 <param name="n_init" value="5"/>
347 <param name="start_column" value="2" />
348 <param name="end_column" value="4" />
310 <param name="batch_size" value="10"/> 349 <param name="batch_size" value="10"/>
311 <param name="n_clusters" value="3"/> 350 <param name="n_clusters" value="4"/>
312 <param name="random_state" value="100"/> 351 <param name="random_state" value="100"/>
313 <param name="reassignment_ratio" value="1.0"/> 352 <param name="reassignment_ratio" value="1.0"/>
314 <output name="outfile" file="cluster_result15.txt"/> 353 <output name="outfile" file="cluster_result15.txt"/>
315 </test> 354 </test>
316 <test> 355 <test>
317 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> 356 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
318 <param name="selected_algorithm" value="KMeans"/> 357 <param name="selected_algorithm" value="KMeans"/>
319 <param name="start_column" value="3" /> 358 <param name="start_column" value="1" />
320 <param name="end_column" value="4" /> 359 <param name="end_column" value="1" />
321 <param name="n_clusters" value="6" /> 360 <param name="n_clusters" value="4" />
322 <param name="init" value="random" />
323 <param name="random_state" value="100"/> 361 <param name="random_state" value="100"/>
324 <output name="outfile" file="cluster_result16.txt"/> 362 <output name="outfile" file="cluster_result16.txt"/>
325 </test> 363 </test>
326 </tests> 364 </tests>
327 <help><![CDATA[ 365 <help><![CDATA[