diff numeric_clustering.xml @ 2:1d465f2ebfad draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/numeric_clustering commit adf077b912ddebd97b07b947b855cdd2862ed8ef
author bgruening
date Fri, 01 Jan 2016 12:58:17 -0500
parents d645cdee08ed
children 6bfbaf81b8f4
line wrap: on
line diff
--- a/numeric_clustering.xml	Fri Jan 01 10:42:52 2016 -0500
+++ b/numeric_clustering.xml	Fri Jan 01 12:58:17 2016 -0500
@@ -8,9 +8,9 @@
     </stdio>
     <macros>
         <token name="@VERSION@">0.9</token>
-        <macro name="n_clusters">
-            <param name="n_clusters" type="integer" optional="true" value="8" label="Number of clusters"
-                help="default value is 8 (--n_clusters)"/>
+        <macro name="n_clusters" token_default_value="8">
+            <param name="n_clusters" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of clusters"
+                help="default value is @DEFAULT_VALUE@ (--n_clusters)"/>
         </macro>
         <macro name="n_init">
             <param name="n_init" type="integer" optional="true" value="" label="Number of runs with different centroid seeds"/>
@@ -62,37 +62,46 @@
     #set $json_string = json.dumps( $params )
 
     python "$cluster_script" '$json_string'
+    &&
+    cat "$cluster_script" >&2
+
 ]]>
     </command>
     <configfiles>
         <configfile name="cluster_script">
-<![CDATA[#!/usr/bin/env python
+<![CDATA[
 import sys
 import json
 import numpy as np
 import sklearn.cluster
 import pandas
 
-data = pandas.DataFrame.from_csv("$infile", sep='\t', header=0, index_col=0, parse_dates=True, encoding=None, tupleize_cols=False )
+data = pandas.read_csv("$infile", sep='\t', header=0, index_col=0, parse_dates=True, encoding=None, tupleize_cols=False )
 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm")
 cluster_object = my_class()
 
 params = json.loads( sys.argv[1] )
 cluster_object.set_params(**params)
-if $end_column >= $start_column:
+#if $end_column and $start_column:
+
+if  $end_column >= $start_column:
     data_matrix = data.values[:, $start_column-1:$end_column]
 else:
     data_matrix = data.values
+
+#else:
+data_matrix = data.values
+#end if
 prediction = cluster_object.fit_predict( data_matrix )
-data['cluster_label'] = prediction
-data.to_csv(path_or_buf = "$outfile",sep="\t")
+data[len(data.columns)] = prediction
+data.to_csv(path_or_buf = "$outfile", sep="\t")
 ]]>
         </configfile>
     </configfiles>
     <inputs>
-        <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
-        <param name="start_column" label="Clustering column from" type="data_column" data_ref="infile" optional="True" />
-        <param name="end_column" label="to" type="data_column" data_ref="infile" optional="True" />
+        <param name="infile" type="data" format="tabular" label="Data file with numeric values" />
+        <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Clustering column from" />
+        <param name="end_column" type="data_column" data_ref="infile" optional="True" label="to" />
         <conditional name="algorithm_options">
             <param name="selected_algorithm" type="select" label="Clustering Algorithm">
                 <option value="KMeans">KMeans</option>
@@ -105,7 +114,7 @@
                 <option value="MiniBatchKMeans">Mini Batch KMeans</option>
             </param>
             <when value="KMeans">
-                <expand macro="n_clusters"/>
+                <expand macro="n_clusters" default_value="8"/>
                 <expand macro="init"/>
                 <expand macro="n_init"/>
                 <expand macro="max_iter"/>
@@ -129,7 +138,7 @@
             <when value="Birch">
                 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/>
                 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/>
-                <expand macro="n_clusters"/> <!-- default to 3-->
+                <expand macro="n_clusters"  default_value="3" /> <!-- default to 3-->
                 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevale="false" label="Compute labels for each fit"/-->
             </when>
             <when value="AffinityPropagation">
@@ -148,7 +157,7 @@
                 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevale="false" label="Cluster all"/>
             </when>
             <when value="AgglomerativeClustering">
-                <expand macro="n_clusters"/> <!-- deafault 2-->
+                <expand macro="n_clusters"  default_value="2" /> <!-- default 2-->
                 <expand macro="affinity"/> <!--default = euclidean-->
                 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/-->
                 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/-->
@@ -162,7 +171,7 @@
                 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/-->
             </when>
             <when value="SpectralClustering">
-                <expand macro="n_clusters"/>
+                <expand macro="n_clusters" default_value="8" />
                 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy">
                     <option value="arpack">arpack</option>
                     <option value="lobpcg">lobpcg</option>
@@ -184,7 +193,7 @@
                 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/-->
             </when>
             <when value="MiniBatchKMeans">
-                <expand macro="n_clusters"/>
+                <expand macro="n_clusters" default_value="8"/>
                 <expand macro="init"/>
                 <expand macro="n_init"/> <!-- default to 3-->
                 <expand macro="max_iter"/> <!--default to 100-->
@@ -205,6 +214,8 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="KMeans"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="n_clusters" value="4" />
             <param name="init" value="k-means++" />
             <param name="random_state" value="100"/>
@@ -213,7 +224,9 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="KMeans"/>
-            <param name="n_clusters" value="6" />
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
+            <param name="n_clusters" value="4" />
             <param name="init" value="random" />
             <param name="random_state" value="100"/>
             <output name="outfile" file="cluster_result02.txt"/>
@@ -221,6 +234,8 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="DBSCAN"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="algorithm" value="kd_tree"/>
             <param name="leaf_size" value="10"/>
             <param name="eps" value="1.0"/>
@@ -229,19 +244,25 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="Birch"/>
-            <param name="n_clusters" value="5"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
+            <param name="n_clusters" value="4"/>
             <param name="threshold" value="0.008"/>
             <output name="outfile" file="cluster_result04.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="Birch"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="branching_factor" value="20"/>
             <output name="outfile" file="cluster_result05.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="AffinityPropagation"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="affinity" value="euclidean"/>
             <param name="copy" value="false"/>
             <output name="outfile" file="cluster_result06.txt"/>
@@ -249,24 +270,32 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="AffinityPropagation"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="damping" value="0.8"/>
             <output name="outfile" file="cluster_result07.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="MeanShift"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="min_bin_freq" value="3"/>
             <output name="outfile" file="cluster_result08.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="MeanShift"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="cluster_all" value="False"/>
             <output name="outfile" file="cluster_result09.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="AgglomerativeClustering"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="affinity" value="euclidean"/>
             <param name="linkage" value="average"/>
             <output name="outfile" file="cluster_result10.txt"/>
@@ -274,16 +303,20 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="AgglomerativeClustering"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="linkage" value="complete"/>
-            <param name="n_clusters" value="5"/>
+            <param name="n_clusters" value="4"/>
             <output name="outfile" file="cluster_result11.txt"/>
         </test>
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="SpectralClustering"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="eigen_solver" value="arpack"/>
             <param name="n_neighbors" value="12"/>
-            <param name="n_clusters" value="7"/>
+            <param name="n_clusters" value="4"/>
             <param name="assign_labels" value="discretize"/>
             <param name="random_state" value="100"/>
             <output name="outfile" file="cluster_result12.txt"/>
@@ -291,6 +324,8 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="SpectralClustering"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="assign_labels" value="discretize"/>
             <param name="random_state" value="100"/>
             <param name="degree" value="2"/>
@@ -299,6 +334,8 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="MiniBatchKMeans"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="tol" value="0.5"/>
             <param name="random_state" value="100"/>
             <output name="outfile" file="cluster_result14.txt"/>
@@ -307,8 +344,10 @@
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="MiniBatchKMeans"/>
             <param name="n_init" value="5"/>
+            <param name="start_column" value="2" />
+            <param name="end_column" value="4" />
             <param name="batch_size" value="10"/>
-            <param name="n_clusters" value="3"/>
+            <param name="n_clusters" value="4"/>
             <param name="random_state" value="100"/>
             <param name="reassignment_ratio" value="1.0"/>
             <output name="outfile" file="cluster_result15.txt"/>
@@ -316,10 +355,9 @@
         <test>
             <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
             <param name="selected_algorithm" value="KMeans"/>
-            <param name="start_column" value="3" />
-            <param name="end_column" value="4" />
-            <param name="n_clusters" value="6" />
-            <param name="init" value="random" />
+            <param name="start_column" value="1" />
+            <param name="end_column" value="1" />
+            <param name="n_clusters" value="4" />
             <param name="random_state" value="100"/>
             <output name="outfile" file="cluster_result16.txt"/>
         </test>