Mercurial > repos > bgruening > sklearn_numeric_clustering
comparison numeric_clustering.xml @ 0:dac8a9712939 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit a6e80305ed0892c8163d690a2d376d6b454824de-dirty
| author | bgruening |
|---|---|
| date | Mon, 02 May 2016 16:16:42 -0400 |
| parents | |
| children | 4fcf8b052fed |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dac8a9712939 |
|---|---|
| 1 <tool id="sklearn_numeric_clustering" name="Numeric Clustering" version="@VERSION@"> | |
| 2 <description></description> | |
| 3 <expand macro="python_requirements"/> | |
| 4 <expand macro="macro_stdio"/> | |
| 5 <macros> | |
| 6 <import>main_macros.xml</import> | |
| 7 </macros> | |
| 8 <version_command>echo "@VERSION@"</version_command> | |
| 9 <command><![CDATA[ | |
| 10 python "$cluster_script" '$inputs' | |
| 11 ]]> | |
| 12 </command> | |
| 13 <configfiles> | |
| 14 <inputs name="inputs"/> | |
| 15 <configfile name="cluster_script"> | |
| 16 <![CDATA[ | |
| 17 import sys | |
| 18 import json | |
| 19 import numpy as np | |
| 20 import sklearn.cluster | |
| 21 import pandas | |
| 22 from sklearn import metrics | |
| 23 from scipy.io import mmread | |
| 24 | |
| 25 input_json_path = sys.argv[1] | |
| 26 params = json.load(open(input_json_path, "r")) | |
| 27 | |
| 28 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] | |
| 29 | |
| 30 my_class = getattr(sklearn.cluster, selected_algorithm) | |
| 31 cluster_object = my_class() | |
| 32 options = params["input_types"]["algorithm_options"]["options"] | |
| 33 | |
| 34 cluster_object.set_params(**options) | |
| 35 | |
| 36 #if $input_types.selected_input_type == "sparse": | |
| 37 data_matrix = mmread(open("$infile", 'r')) | |
| 38 #else: | |
| 39 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | |
| 40 | |
| 41 start_column = $input_types.start_column | |
| 42 end_column = $input_types.end_column | |
| 43 | |
| 44 if end_column and start_column: | |
| 45 if end_column >= start_column: | |
| 46 data_matrix = data.values[:, start_column-1:end_column] | |
| 47 else: | |
| 48 data_matrix = data.values | |
| 49 else: | |
| 50 data_matrix = data.values | |
| 51 #end if | |
| 52 | |
| 53 prediction = cluster_object.fit_predict( data_matrix ) | |
| 54 | |
| 55 if len(np.unique(prediction)) > 1: | |
| 56 silhouette_score = metrics.silhouette_score(data_matrix,prediction,metric='euclidean') | |
| 57 else: | |
| 58 silhouette_score = -1 | |
| 59 sys.stdout.write('silhouette score:' + '\t' + str(silhouette_score) + '\n') | |
| 60 | |
| 61 prediction_df = pandas.DataFrame(prediction) | |
| 62 | |
| 63 #if $input_types.selected_input_type == "sparse": | |
| 64 res = prediction_df | |
| 65 #else: | |
| 66 res = pandas.concat([data, prediction_df], axis=1) | |
| 67 #end if | |
| 68 | |
| 69 res.to_csv(path_or_buf = "$outfile", sep="\t", index=False, header=False) | |
| 70 ]]> | |
| 71 </configfile> | |
| 72 </configfiles> | |
| 73 <inputs> | |
| 74 <conditional name="input_types"> | |
| 75 <param name="selected_input_type" type="select" label="Select the format of input data"> | |
| 76 <option value="tabular" selected="true">Tabular Format (tabular, txt)</option> | |
| 77 <option value="sparse">Sparse Vector Representation (mtx)</option> | |
| 78 </param> | |
| 79 <when value="sparse"> | |
| 80 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini Batch KMeans'', and ''Spectral Clustering''. To use any of the other clustering algorithms, provide your data in tabular format."/> | |
| 81 <expand macro="clustering_algorithms_options"/> | |
| 82 </when> | |
| 83 <when value="tabular"> | |
| 84 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> | |
| 85 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> | |
| 86 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> | |
| 87 <!--expand macro="clustering_algorithms_options"--> | |
| 88 <conditional name="algorithm_options"> | |
| 89 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> | |
| 90 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> | |
| 91 <option value="AffinityPropagation">Affinity Propagation</option> | |
| 92 <option value="SpectralClustering">Spectral Clustering</option> | |
| 93 <option value="MiniBatchKMeans">Mini Batch KMeans</option> | |
| 94 <option value="MeanShift">MeanShift</option> | |
| 95 <option value="KMeans">KMeans</option> | |
| 96 <option value="DBSCAN">DBSCAN</option> | |
| 97 <option value="Birch">Birch</option> | |
| 98 </param> | |
| 99 <when value="KMeans"> | |
| 100 <expand macro="kmeans_advanced_options"/> | |
| 101 </when> | |
| 102 <when value="DBSCAN"> | |
| 103 <expand macro="dbscan_advanced_options"/> | |
| 104 </when> | |
| 105 <when value="Birch"> | |
| 106 <expand macro="birch_advanced_options"/> | |
| 107 </when> | |
| 108 <when value="SpectralClustering"> | |
| 109 <expand macro="spectral_clustering_advanced_options"/> | |
| 110 </when> | |
| 111 <when value="MiniBatchKMeans"> | |
| 112 <expand macro="minibatch_kmeans_advanced_options"/> | |
| 113 </when> | |
| 114 <when value="AffinityPropagation"> | |
| 115 <section name="options" title="Advanced Options" expanded="False"> | |
| 116 <param argument="damping" type="float" optional="true" value="0.5" label="Damping factor" help="Damping factor between 0.5 and 1."/> | |
| 117 <expand macro="max_iter" default_value="200"/> | |
| 118 <param argument="convergence_iter" type="integer" optional="true" value="15" label="Number of iterations at each convergence step" help="Number of iterations with no change in the number of estimated clusters that stops the convergence."/> | |
| 119 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Copy" help="If False, the affinity matrix is modified inplace by the algorithm, for memory efficiency."/> | |
| 120 <!--param argument="preference"/--> | |
| 121 <param argument="affinity" type="select" label="Affinity" help="Affinity to use; euclidean uses the negative squared euclidean distance between points."> | |
| 122 <option value="euclidean">Euclidean</option> | |
| 123 <option value="precomputed">precomputed</option> | |
| 124 </param> | |
| 125 </section> | |
| 126 </when> | |
| 127 <when value="MeanShift"> | |
| 128 <section name="options" title="Advanced Options" expanded="False"> | |
| 129 <param argument="bandwidth" type="float" optional="true" value="" label="Kernel bandwidth" help="Bandwidth used in the RBF kernel. If not given, it will be computed using a heuristic based on the median of all pairwise distances."/> | |
| 130 <!--param argument="seeds"/--> | |
| 131 <param argument="bin_seeding" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Discretize initial kernel locations" help="If true, initial kernel locations are the bins grid whose coarseness corresponds to the bandwidth, speeding up the algorithm."/> | |
| 132 <param argument="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin" help="To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds."/> | |
| 133 <param argument="cluster_all" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Cluster all" help="If true, all points (including orphans) are clustered. If false, orphans are given cluster label -1."/> | |
| 134 </section> | |
| 135 </when> | |
| 136 <when value="AgglomerativeClustering"> | |
| 137 <section name="options" title="Advanced Options" expanded="False"> | |
| 138 <expand macro="n_clusters" default_value="2" /> | |
| 139 <param argument="affinity" type="select" label="Affinity" help="Metric used to compute the linkage. If linkage is ''ward'', only ''euclidean'' is accepted."> | |
| 140 <option value="euclidean">Euclidean</option> | |
| 141 <option value="manhattan">Manhattan</option> | |
| 142 <option value="l1">L1</option> | |
| 143 <option value="l2">L2</option> | |
| 144 <option value="cosine">cosine</option> | |
| 145 <option value="precomputed">precomputed</option> | |
| 146 </param> | |
| 147 <!--param argument="memory"--> | |
| 148 <!--param argument="connectivity"--> | |
| 149 <!--param argument="n_components"/--> | |
| 150 <!--param argument="compute_full_tree"--> | |
| 151 <param argument="linkage" type="select" optional="true" label="Linkage" help=""> | |
| 152 <option value="ward" selected="true">ward</option> | |
| 153 <option value="complete">complete</option> | |
| 154 <option value="average">average</option> | |
| 155 </param> | |
| 156 <!--param argument="pooling_func"--> | |
| 157 </section> | |
| 158 </when> | |
| 159 </conditional> | |
| 160 </when> | |
| 161 </conditional> | |
| 162 </inputs> | |
| 163 <outputs> | |
| 164 <data format="tabular" name="outfile"/> | |
| 165 </outputs> | |
| 166 <tests> | |
| 167 <test> | |
| 168 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 169 <param name="selected_input_type" value="tabular"/> | |
| 170 <param name="selected_algorithm" value="KMeans"/> | |
| 171 <param name="start_column" value="2" /> | |
| 172 <param name="end_column" value="4" /> | |
| 173 <param name="n_clusters" value="4" /> | |
| 174 <param name="init" value="k-means++" /> | |
| 175 <param name="random_state" value="100"/> | |
| 176 <output name="outfile" file="cluster_result01.txt"/> | |
| 177 </test> | |
| 178 <test> | |
| 179 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 180 <param name="selected_algorithm" value="KMeans"/> | |
| 181 <param name="selected_input_type" value="tabular"/> | |
| 182 <param name="start_column" value="2" /> | |
| 183 <param name="end_column" value="4" /> | |
| 184 <param name="n_clusters" value="4" /> | |
| 185 <param name="init" value="random" /> | |
| 186 <param name="random_state" value="100"/> | |
| 187 <output name="outfile" file="cluster_result02.txt"/> | |
| 188 </test> | |
| 189 <test> | |
| 190 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 191 <param name="selected_algorithm" value="DBSCAN"/> | |
| 192 <param name="selected_input_type" value="tabular"/> | |
| 193 <param name="start_column" value="2" /> | |
| 194 <param name="end_column" value="4" /> | |
| 195 <param name="algorithm" value="kd_tree"/> | |
| 196 <param name="leaf_size" value="10"/> | |
| 197 <param name="eps" value="1.0"/> | |
| 198 <output name="outfile" file="cluster_result03.txt"/> | |
| 199 </test> | |
| 200 <test> | |
| 201 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 202 <param name="selected_algorithm" value="Birch"/> | |
| 203 <param name="selected_input_type" value="tabular"/> | |
| 204 <param name="start_column" value="2" /> | |
| 205 <param name="end_column" value="4" /> | |
| 206 <param name="n_clusters" value="4"/> | |
| 207 <param name="threshold" value="0.008"/> | |
| 208 <output name="outfile" file="cluster_result04.txt"/> | |
| 209 </test> | |
| 210 <test> | |
| 211 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 212 <param name="selected_algorithm" value="Birch"/> | |
| 213 <param name="selected_input_type" value="tabular"/> | |
| 214 <param name="start_column" value="2" /> | |
| 215 <param name="end_column" value="4" /> | |
| 216 <param name="branching_factor" value="20"/> | |
| 217 <output name="outfile" file="cluster_result05.txt"/> | |
| 218 </test> | |
| 219 <test> | |
| 220 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 221 <param name="selected_algorithm" value="AffinityPropagation"/> | |
| 222 <param name="selected_input_type" value="tabular"/> | |
| 223 <param name="start_column" value="2" /> | |
| 224 <param name="end_column" value="4" /> | |
| 225 <param name="affinity" value="euclidean"/> | |
| 226 <param name="copy" value="false"/> | |
| 227 <output name="outfile" file="cluster_result06.txt"/> | |
| 228 </test> | |
| 229 <test> | |
| 230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 231 <param name="selected_algorithm" value="AffinityPropagation"/> | |
| 232 <param name="selected_input_type" value="tabular"/> | |
| 233 <param name="start_column" value="2" /> | |
| 234 <param name="end_column" value="4" /> | |
| 235 <param name="damping" value="0.8"/> | |
| 236 <output name="outfile" file="cluster_result07.txt"/> | |
| 237 </test> | |
| 238 <test> | |
| 239 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 240 <param name="selected_algorithm" value="MeanShift"/> | |
| 241 <param name="selected_input_type" value="tabular"/> | |
| 242 <param name="start_column" value="2" /> | |
| 243 <param name="end_column" value="4" /> | |
| 244 <param name="min_bin_freq" value="3"/> | |
| 245 <output name="outfile" file="cluster_result08.txt"/> | |
| 246 </test> | |
| 247 <test> | |
| 248 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 249 <param name="selected_algorithm" value="MeanShift"/> | |
| 250 <param name="selected_input_type" value="tabular"/> | |
| 251 <param name="start_column" value="2" /> | |
| 252 <param name="end_column" value="4" /> | |
| 253 <param name="cluster_all" value="False"/> | |
| 254 <output name="outfile" file="cluster_result09.txt"/> | |
| 255 </test> | |
| 256 <test> | |
| 257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 258 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
| 259 <param name="selected_input_type" value="tabular"/> | |
| 260 <param name="start_column" value="2" /> | |
| 261 <param name="end_column" value="4" /> | |
| 262 <param name="affinity" value="euclidean"/> | |
| 263 <param name="linkage" value="average"/> | |
| 264 <param name="n_clusters" value="4"/> | |
| 265 <output name="outfile" file="cluster_result10.txt"/> | |
| 266 </test> | |
| 267 <test> | |
| 268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 269 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
| 270 <param name="selected_input_type" value="tabular"/> | |
| 271 <param name="start_column" value="2" /> | |
| 272 <param name="end_column" value="4" /> | |
| 273 <param name="linkage" value="complete"/> | |
| 274 <param name="n_clusters" value="4"/> | |
| 275 <output name="outfile" file="cluster_result11.txt"/> | |
| 276 </test> | |
| 277 <test> | |
| 278 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 279 <param name="selected_algorithm" value="SpectralClustering"/> | |
| 280 <param name="selected_input_type" value="tabular"/> | |
| 281 <param name="start_column" value="2" /> | |
| 282 <param name="end_column" value="4" /> | |
| 283 <param name="eigen_solver" value="arpack"/> | |
| 284 <param name="n_neighbors" value="12"/> | |
| 285 <param name="n_clusters" value="4"/> | |
| 286 <param name="assign_labels" value="discretize"/> | |
| 287 <param name="random_state" value="100"/> | |
| 288 <output name="outfile" file="empty_file.txt" compare="contains"/> | |
| 289 </test> | |
| 290 <test> | |
| 291 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 292 <param name="selected_algorithm" value="SpectralClustering"/> | |
| 293 <param name="selected_input_type" value="tabular"/> | |
| 294 <param name="start_column" value="2" /> | |
| 295 <param name="end_column" value="4" /> | |
| 296 <param name="assign_labels" value="discretize"/> | |
| 297 <param name="random_state" value="100"/> | |
| 298 <param name="degree" value="2"/> | |
| 299 <output name="outfile" file="empty_file.txt" compare="contains"/> | |
| 300 </test> | |
| 301 <test> | |
| 302 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 303 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
| 304 <param name="selected_input_type" value="tabular"/> | |
| 305 <param name="start_column" value="2" /> | |
| 306 <param name="end_column" value="4" /> | |
| 307 <param name="tol" value="0.5"/> | |
| 308 <param name="random_state" value="100"/> | |
| 309 <output name="outfile" file="cluster_result14.txt"/> | |
| 310 </test> | |
| 311 <test> | |
| 312 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 313 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
| 314 <param name="selected_input_type" value="tabular"/> | |
| 315 <param name="n_init" value="5"/> | |
| 316 <param name="start_column" value="2" /> | |
| 317 <param name="end_column" value="4" /> | |
| 318 <param name="batch_size" value="10"/> | |
| 319 <param name="n_clusters" value="4"/> | |
| 320 <param name="random_state" value="100"/> | |
| 321 <param name="reassignment_ratio" value="1.0"/> | |
| 322 <output name="outfile" file="cluster_result15.txt"/> | |
| 323 </test> | |
| 324 <test> | |
| 325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 326 <param name="selected_algorithm" value="KMeans"/> | |
| 327 <param name="selected_input_type" value="tabular"/> | |
| 328 <param name="start_column" value="1" /> | |
| 329 <param name="end_column" value="1" /> | |
| 330 <param name="n_clusters" value="4" /> | |
| 331 <param name="random_state" value="100"/> | |
| 332 <output name="outfile" file="cluster_result16.txt"/> | |
| 333 </test> | |
| 334 <test> | |
| 335 <param name="infile" value="sparse.mtx" ftype="txt"/> | |
| 336 <param name="selected_input_type" value="sparse"/> | |
| 337 <param name="selected_algorithm" value="KMeans"/> | |
| 338 <param name="n_clusters" value="2" /> | |
| 339 <param name="init" value="k-means++" /> | |
| 340 <param name="random_state" value="100"/> | |
| 341 <output name="outfile" file="cluster_result17.txt"/> | |
| 342 </test> | |
| 343 <test> | |
| 344 <param name="infile" value="sparse.mtx" ftype="txt"/> | |
| 345 <param name="selected_algorithm" value="DBSCAN"/> | |
| 346 <param name="selected_input_type" value="sparse"/> | |
| 347 <param name="algorithm" value="kd_tree"/> | |
| 348 <param name="leaf_size" value="10"/> | |
| 349 <param name="eps" value="1.0"/> | |
| 350 <output name="outfile" file="cluster_result18.txt"/> | |
| 351 </test> | |
| 352 <test> | |
| 353 <param name="infile" value="sparse.mtx" ftype="txt"/> | |
| 354 <param name="selected_algorithm" value="Birch"/> | |
| 355 <param name="selected_input_type" value="sparse"/> | |
| 356 <param name="n_clusters" value="2"/> | |
| 357 <param name="threshold" value="0.008"/> | |
| 358 <output name="outfile" file="cluster_result19.txt"/> | |
| 359 </test> | |
| 360 <test> | |
| 361 <param name="infile" value="sparse.mtx" ftype="txt"/> | |
| 362 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
| 363 <param name="selected_input_type" value="sparse"/> | |
| 364 <param name="n_init" value="5"/> | |
| 365 <param name="batch_size" value="10"/> | |
| 366 <param name="n_clusters" value="2"/> | |
| 367 <param name="random_state" value="100"/> | |
| 368 <param name="reassignment_ratio" value="1.0"/> | |
| 369 <output name="outfile" file="cluster_result20.txt"/> | |
| 370 </test> | |
| 371 <test> | |
| 372 <param name="infile" value="sparse.mtx" ftype="txt"/> | |
| 373 <param name="selected_algorithm" value="SpectralClustering"/> | |
| 374 <param name="selected_input_type" value="sparse"/> | |
| 375 <param name="assign_labels" value="discretize"/> | |
| 376 <param name="n_clusters" value="2"/> | |
| 377 <param name="random_state" value="100"/> | |
| 378 <param name="degree" value="2"/> | |
| 379 <output name="outfile" file="cluster_result21.txt"/> | |
| 380 </test> | |
| 381 </tests> | |
| 382 <help><![CDATA[ | |
| 383 **What it does** | |
| 384 This tool offers different clustering algorithms which are provided by | |
| 385 scikit-learn to find similarities among samples and cluster the samples based on these similarities. | |
| 386 ]]></help> | |
| 387 <expand macro="sklearn_citation"/> | |
| 388 </tool> |
