Mercurial > repos > bgruening > sklearn_numeric_clustering
comparison numeric_clustering.xml @ 19:8a7b460ab534 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5d71c93a3dd804b1469852240a86021ab9130364
| author | bgruening |
|---|---|
| date | Mon, 09 Jul 2018 14:27:04 -0400 |
| parents | 4edccd1eaaf0 |
| children | 60d1b396cea2 |
comparison
equal
deleted
inserted
replaced
| 18:06d67d77907c | 19:8a7b460ab534 |
|---|---|
| 20 import sklearn.cluster | 20 import sklearn.cluster |
| 21 import pandas | 21 import pandas |
| 22 from sklearn import metrics | 22 from sklearn import metrics |
| 23 from scipy.io import mmread | 23 from scipy.io import mmread |
| 24 | 24 |
| 25 @COLUMNS_FUNCTION@ | |
| 26 | |
| 25 input_json_path = sys.argv[1] | 27 input_json_path = sys.argv[1] |
| 26 params = json.load(open(input_json_path, "r")) | 28 params = json.load(open(input_json_path, "r")) |
| 27 | 29 |
| 28 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] | 30 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] |
| 29 | 31 |
| 35 | 37 |
| 36 #if $input_types.selected_input_type == "sparse": | 38 #if $input_types.selected_input_type == "sparse": |
| 37 data_matrix = mmread(open("$infile", 'r')) | 39 data_matrix = mmread(open("$infile", 'r')) |
| 38 #else: | 40 #else: |
| 39 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | 41 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) |
| 40 | 42 header = 'infer' if params["input_types"]["header"] else None |
| 41 start_column = $input_types.start_column | 43 column_option = params["input_types"]["column_selector_options"]["selected_column_selector_option"] |
| 42 end_column = $input_types.end_column | 44 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: |
| 43 | 45 c = params["input_types"]["column_selector_options"]["col"] |
| 44 if end_column and start_column: | |
| 45 if end_column >= start_column: | |
| 46 data_matrix = data.values[:, start_column-1:end_column] | |
| 47 else: | |
| 48 data_matrix = data.values | |
| 49 else: | 46 else: |
| 50 data_matrix = data.values | 47 c = None |
| 48 data_matrix = read_columns( | |
| 49 "$infile", | |
| 50 c = c, | |
| 51 c_option = column_option, | |
| 52 sep='\t', | |
| 53 header=header, | |
| 54 parse_dates=True, | |
| 55 encoding=None, | |
| 56 tupleize_cols=False | |
| 57 ) | |
| 51 #end if | 58 #end if |
| 52 | 59 |
| 53 prediction = cluster_object.fit_predict( data_matrix ) | 60 prediction = cluster_object.fit_predict( data_matrix ) |
| 54 | 61 |
| 55 if len(np.unique(prediction)) > 1: | 62 if len(np.unique(prediction)) > 1: |
| 80 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/> | 87 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/> |
| 81 <expand macro="clustering_algorithms_options"/> | 88 <expand macro="clustering_algorithms_options"/> |
| 82 </when> | 89 </when> |
| 83 <when value="tabular"> | 90 <when value="tabular"> |
| 84 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> | 91 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> |
| 85 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> | 92 <param name="header" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="True" label="Does the dataset contain header:" /> |
| 86 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> | 93 <conditional name="column_selector_options"> |
| 94 <expand macro="samples_column_selector_options" col_name="col" multiple="true" infile="infile"/> | |
| 95 </conditional> | |
| 87 <!--expand macro="clustering_algorithms_options"--> | 96 <!--expand macro="clustering_algorithms_options"--> |
| 88 <conditional name="algorithm_options"> | 97 <conditional name="algorithm_options"> |
| 89 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> | 98 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> |
| 90 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> | 99 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> |
| 91 <option value="AffinityPropagation">Affinity Propagation</option> | 100 <option value="AffinityPropagation">Affinity Propagation</option> |
| 166 <tests> | 175 <tests> |
| 167 <test> | 176 <test> |
| 168 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 177 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 169 <param name="selected_input_type" value="tabular"/> | 178 <param name="selected_input_type" value="tabular"/> |
| 170 <param name="selected_algorithm" value="KMeans"/> | 179 <param name="selected_algorithm" value="KMeans"/> |
| 171 <param name="start_column" value="2" /> | 180 <param name="col" value="2,3,4" /> |
| 172 <param name="end_column" value="4" /> | |
| 173 <param name="n_clusters" value="4" /> | 181 <param name="n_clusters" value="4" /> |
| 174 <param name="init" value="k-means++" /> | 182 <param name="init" value="k-means++" /> |
| 175 <param name="random_state" value="100"/> | 183 <param name="random_state" value="100"/> |
| 176 <output name="outfile" file="cluster_result01.txt"/> | 184 <output name="outfile" file="cluster_result01.txt"/> |
| 177 </test> | 185 </test> |
| 178 <test> | 186 <test> |
| 179 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 187 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 180 <param name="selected_algorithm" value="KMeans"/> | 188 <param name="selected_algorithm" value="KMeans"/> |
| 181 <param name="selected_input_type" value="tabular"/> | 189 <param name="selected_input_type" value="tabular"/> |
| 182 <param name="start_column" value="2" /> | 190 <param name="col" value="2,3,4" /> |
| 183 <param name="end_column" value="4" /> | |
| 184 <param name="n_clusters" value="4" /> | 191 <param name="n_clusters" value="4" /> |
| 185 <param name="init" value="random" /> | 192 <param name="init" value="random" /> |
| 186 <param name="random_state" value="100"/> | 193 <param name="random_state" value="100"/> |
| 187 <output name="outfile" file="cluster_result02.txt"/> | 194 <output name="outfile" file="cluster_result02.txt"/> |
| 188 </test> | 195 </test> |
| 189 <test> | 196 <test> |
| 190 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 197 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 191 <param name="selected_algorithm" value="DBSCAN"/> | 198 <param name="selected_algorithm" value="DBSCAN"/> |
| 192 <param name="selected_input_type" value="tabular"/> | 199 <param name="selected_input_type" value="tabular"/> |
| 193 <param name="start_column" value="2" /> | 200 <param name="col" value="2,3,4" /> |
| 194 <param name="end_column" value="4" /> | |
| 195 <param name="algorithm" value="kd_tree"/> | 201 <param name="algorithm" value="kd_tree"/> |
| 196 <param name="leaf_size" value="10"/> | 202 <param name="leaf_size" value="10"/> |
| 197 <param name="eps" value="1.0"/> | 203 <param name="eps" value="1.0"/> |
| 198 <output name="outfile" file="cluster_result03.txt"/> | 204 <output name="outfile" file="cluster_result03.txt"/> |
| 199 </test> | 205 </test> |
| 200 <test> | 206 <test> |
| 201 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 207 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 202 <param name="selected_algorithm" value="Birch"/> | 208 <param name="selected_algorithm" value="Birch"/> |
| 203 <param name="selected_input_type" value="tabular"/> | 209 <param name="selected_input_type" value="tabular"/> |
| 204 <param name="start_column" value="2" /> | 210 <param name="col" value="2,3,4" /> |
| 205 <param name="end_column" value="4" /> | |
| 206 <param name="n_clusters" value="4"/> | 211 <param name="n_clusters" value="4"/> |
| 207 <param name="threshold" value="0.008"/> | 212 <param name="threshold" value="0.008"/> |
| 208 <output name="outfile" file="cluster_result04.txt"/> | 213 <output name="outfile" file="cluster_result04.txt"/> |
| 209 </test> | 214 </test> |
| 210 <test> | 215 <test> |
| 211 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 216 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 212 <param name="selected_algorithm" value="Birch"/> | 217 <param name="selected_algorithm" value="Birch"/> |
| 213 <param name="selected_input_type" value="tabular"/> | 218 <param name="selected_input_type" value="tabular"/> |
| 214 <param name="start_column" value="2" /> | 219 <param name="col" value="2,3,4" /> |
| 215 <param name="end_column" value="4" /> | |
| 216 <param name="branching_factor" value="20"/> | 220 <param name="branching_factor" value="20"/> |
| 217 <output name="outfile" file="cluster_result05.txt"/> | 221 <output name="outfile" file="cluster_result05.txt"/> |
| 218 </test> | 222 </test> |
| 219 <test> | 223 <test> |
| 220 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 224 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 221 <param name="selected_algorithm" value="AffinityPropagation"/> | 225 <param name="selected_algorithm" value="AffinityPropagation"/> |
| 222 <param name="selected_input_type" value="tabular"/> | 226 <param name="selected_input_type" value="tabular"/> |
| 223 <param name="start_column" value="2" /> | 227 <param name="col" value="2,3,4" /> |
| 224 <param name="end_column" value="4" /> | |
| 225 <param name="affinity" value="euclidean"/> | 228 <param name="affinity" value="euclidean"/> |
| 226 <param name="copy" value="false"/> | 229 <param name="copy" value="false"/> |
| 227 <output name="outfile" file="cluster_result06.txt"/> | 230 <output name="outfile" file="cluster_result06.txt"/> |
| 228 </test> | 231 </test> |
| 229 <test> | 232 <test> |
| 230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 233 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 231 <param name="selected_algorithm" value="AffinityPropagation"/> | 234 <param name="selected_algorithm" value="AffinityPropagation"/> |
| 232 <param name="selected_input_type" value="tabular"/> | 235 <param name="selected_input_type" value="tabular"/> |
| 233 <param name="start_column" value="2" /> | 236 <param name="col" value="2,3,4" /> |
| 234 <param name="end_column" value="4" /> | |
| 235 <param name="damping" value="0.8"/> | 237 <param name="damping" value="0.8"/> |
| 236 <output name="outfile" file="cluster_result07.txt"/> | 238 <output name="outfile" file="cluster_result07.txt"/> |
| 237 </test> | 239 </test> |
| 238 <test> | 240 <test> |
| 239 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 241 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 240 <param name="selected_algorithm" value="MeanShift"/> | 242 <param name="selected_algorithm" value="MeanShift"/> |
| 241 <param name="selected_input_type" value="tabular"/> | 243 <param name="selected_input_type" value="tabular"/> |
| 242 <param name="start_column" value="2" /> | 244 <param name="col" value="2,3,4" /> |
| 243 <param name="end_column" value="4" /> | |
| 244 <param name="min_bin_freq" value="3"/> | 245 <param name="min_bin_freq" value="3"/> |
| 245 <output name="outfile" file="cluster_result08.txt"/> | 246 <output name="outfile" file="cluster_result08.txt"/> |
| 246 </test> | 247 </test> |
| 247 <test> | 248 <test> |
| 248 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 249 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 249 <param name="selected_algorithm" value="MeanShift"/> | 250 <param name="selected_algorithm" value="MeanShift"/> |
| 250 <param name="selected_input_type" value="tabular"/> | 251 <param name="selected_input_type" value="tabular"/> |
| 251 <param name="start_column" value="2" /> | 252 <param name="col" value="2,3,4" /> |
| 252 <param name="end_column" value="4" /> | |
| 253 <param name="cluster_all" value="False"/> | 253 <param name="cluster_all" value="False"/> |
| 254 <output name="outfile" file="cluster_result09.txt"/> | 254 <output name="outfile" file="cluster_result09.txt"/> |
| 255 </test> | 255 </test> |
| 256 <test> | 256 <test> |
| 257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 258 <param name="selected_algorithm" value="AgglomerativeClustering"/> | 258 <param name="selected_algorithm" value="AgglomerativeClustering"/> |
| 259 <param name="selected_input_type" value="tabular"/> | 259 <param name="selected_input_type" value="tabular"/> |
| 260 <param name="start_column" value="2" /> | 260 <param name="col" value="2,3,4" /> |
| 261 <param name="end_column" value="4" /> | |
| 262 <param name="affinity" value="euclidean"/> | 261 <param name="affinity" value="euclidean"/> |
| 263 <param name="linkage" value="average"/> | 262 <param name="linkage" value="average"/> |
| 264 <param name="n_clusters" value="4"/> | 263 <param name="n_clusters" value="4"/> |
| 265 <output name="outfile" file="cluster_result10.txt"/> | 264 <output name="outfile" file="cluster_result10.txt"/> |
| 266 </test> | 265 </test> |
| 267 <test> | 266 <test> |
| 268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 267 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 269 <param name="selected_algorithm" value="AgglomerativeClustering"/> | 268 <param name="selected_algorithm" value="AgglomerativeClustering"/> |
| 270 <param name="selected_input_type" value="tabular"/> | 269 <param name="selected_input_type" value="tabular"/> |
| 271 <param name="start_column" value="2" /> | 270 <param name="col" value="2,3,4" /> |
| 272 <param name="end_column" value="4" /> | |
| 273 <param name="linkage" value="complete"/> | 271 <param name="linkage" value="complete"/> |
| 274 <param name="n_clusters" value="4"/> | 272 <param name="n_clusters" value="4"/> |
| 275 <output name="outfile" file="cluster_result11.txt"/> | 273 <output name="outfile" file="cluster_result11.txt"/> |
| 276 </test> | 274 </test> |
| 277 <test> | 275 <test> |
| 278 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 276 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 279 <param name="selected_algorithm" value="SpectralClustering"/> | 277 <param name="selected_algorithm" value="SpectralClustering"/> |
| 280 <param name="selected_input_type" value="tabular"/> | 278 <param name="selected_input_type" value="tabular"/> |
| 281 <param name="start_column" value="2" /> | 279 <param name="col" value="2,3,4" /> |
| 282 <param name="end_column" value="4" /> | |
| 283 <param name="eigen_solver" value="arpack"/> | 280 <param name="eigen_solver" value="arpack"/> |
| 284 <param name="n_neighbors" value="12"/> | 281 <param name="n_neighbors" value="12"/> |
| 285 <param name="n_clusters" value="4"/> | 282 <param name="n_clusters" value="4"/> |
| 286 <param name="assign_labels" value="discretize"/> | 283 <param name="assign_labels" value="discretize"/> |
| 287 <param name="random_state" value="100"/> | 284 <param name="random_state" value="100"/> |
| 289 </test> | 286 </test> |
| 290 <test> | 287 <test> |
| 291 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 288 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 292 <param name="selected_algorithm" value="SpectralClustering"/> | 289 <param name="selected_algorithm" value="SpectralClustering"/> |
| 293 <param name="selected_input_type" value="tabular"/> | 290 <param name="selected_input_type" value="tabular"/> |
| 294 <param name="start_column" value="2" /> | 291 <param name="col" value="2,3,4" /> |
| 295 <param name="end_column" value="4" /> | |
| 296 <param name="assign_labels" value="discretize"/> | 292 <param name="assign_labels" value="discretize"/> |
| 297 <param name="random_state" value="100"/> | 293 <param name="random_state" value="100"/> |
| 298 <param name="degree" value="2"/> | 294 <param name="degree" value="2"/> |
| 299 <output name="outfile" file="cluster_result13.txt" compare="sim_size" /> | 295 <output name="outfile" file="cluster_result13.txt" compare="sim_size" /> |
| 300 </test> | 296 </test> |
| 301 <test> | 297 <test> |
| 302 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 298 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 303 <param name="selected_algorithm" value="MiniBatchKMeans"/> | 299 <param name="selected_algorithm" value="MiniBatchKMeans"/> |
| 304 <param name="selected_input_type" value="tabular"/> | 300 <param name="selected_input_type" value="tabular"/> |
| 305 <param name="start_column" value="2" /> | 301 <param name="col" value="2,3,4" /> |
| 306 <param name="end_column" value="4" /> | |
| 307 <param name="tol" value="0.5"/> | 302 <param name="tol" value="0.5"/> |
| 308 <param name="random_state" value="100"/> | 303 <param name="random_state" value="100"/> |
| 309 <output name="outfile" file="cluster_result14.txt"/> | 304 <output name="outfile" file="cluster_result14.txt"/> |
| 310 </test> | 305 </test> |
| 311 <test> | 306 <test> |
| 312 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 307 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 313 <param name="selected_algorithm" value="MiniBatchKMeans"/> | 308 <param name="selected_algorithm" value="MiniBatchKMeans"/> |
| 314 <param name="selected_input_type" value="tabular"/> | 309 <param name="selected_input_type" value="tabular"/> |
| 315 <param name="n_init" value="5"/> | 310 <param name="n_init" value="5"/> |
| 316 <param name="start_column" value="2" /> | 311 <param name="col" value="2,3,4" /> |
| 317 <param name="end_column" value="4" /> | |
| 318 <param name="batch_size" value="10"/> | 312 <param name="batch_size" value="10"/> |
| 319 <param name="n_clusters" value="4"/> | 313 <param name="n_clusters" value="4"/> |
| 320 <param name="random_state" value="100"/> | 314 <param name="random_state" value="100"/> |
| 321 <param name="reassignment_ratio" value="1.0"/> | 315 <param name="reassignment_ratio" value="1.0"/> |
| 322 <output name="outfile" file="cluster_result15.txt"/> | 316 <output name="outfile" file="cluster_result15.txt"/> |
| 323 </test> | 317 </test> |
| 324 <test> | 318 <test> |
| 325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 319 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
| 326 <param name="selected_algorithm" value="KMeans"/> | 320 <param name="selected_algorithm" value="KMeans"/> |
| 327 <param name="selected_input_type" value="tabular"/> | 321 <param name="selected_input_type" value="tabular"/> |
| 328 <param name="start_column" value="1" /> | 322 <param name="col" value="1" /> |
| 329 <param name="end_column" value="1" /> | |
| 330 <param name="n_clusters" value="4" /> | 323 <param name="n_clusters" value="4" /> |
| 331 <param name="random_state" value="100"/> | 324 <param name="random_state" value="100"/> |
| 332 <output name="outfile" file="cluster_result16.txt"/> | 325 <output name="outfile" file="cluster_result16.txt"/> |
| 333 </test> | 326 </test> |
| 334 <test> | 327 <test> |
