Mercurial > repos > bgruening > sklearn_model_validation
comparison pca.py @ 27:376c88f35e0e draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
| author | bgruening |
|---|---|
| date | Tue, 13 Apr 2021 21:18:23 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 26:4035b14da848 | 27:376c88f35e0e |
|---|---|
| 1 import argparse | |
| 2 | |
| 3 import numpy as np | |
| 4 from galaxy_ml.utils import read_columns | |
| 5 from sklearn.decomposition import IncrementalPCA, KernelPCA, PCA | |
| 6 | |
| 7 | |
# Column-selection modes that require --column_indices to be supplied.
_INDEX_MODES = ("by_index_number", "all_but_by_index_number")
_HEADER_MODES = ("by_header_name", "all_but_by_header_name")


def _build_parser():
    """Create the argparse parser describing every option of the PCA tool."""
    parser = argparse.ArgumentParser(
        description="Principal component analysis (classical, incremental or kernel)"
    )
    parser.add_argument("-i", "--infile", help="Input file")
    parser.add_argument(
        "--header", action="store_true", help="Include the header row or skip it"
    )
    parser.add_argument(
        "-c",
        "--columns",
        type=str.lower,
        # The default must be one of the declared choices; argparse does not
        # validate defaults, so the previous "all" slipped through unchecked.
        default="all_columns",
        choices=[
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
            "all_columns",
        ],
        help="Choose to select all columns, or exclude/include some",
    )
    parser.add_argument(
        "-ci",
        "--column_indices",
        # Kept as a plain string: header names are case-sensitive, so the
        # value must not be lowercased before it is used to look up columns.
        type=str,
        help="Comma-separated column indices or header names to include/exclude",
    )
    parser.add_argument(
        "-n",
        "--number",
        nargs="?",
        type=int,
        default=None,
        help="Number of components to keep. If not set, all components are kept",
    )
    parser.add_argument("--whiten", action="store_true", help="Whiten the components")
    parser.add_argument(
        "-t",
        "--pca_type",
        type=str.lower,
        default="classical",
        choices=["classical", "incremental", "kernel"],
        help="Choose which flavour of PCA to use",
    )
    parser.add_argument(
        "-s",
        "--svd_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "full", "arpack", "randomized"],
        help="Choose the type of svd solver.",
    )
    parser.add_argument(
        "-b",
        "--batch_size",
        nargs="?",
        type=int,
        default=None,
        help="The number of samples to use for each batch",
    )
    parser.add_argument(
        "-k",
        "--kernel",
        type=str.lower,
        default="linear",
        choices=["linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"],
        help="Choose the type of kernel.",
    )
    parser.add_argument(
        "-g",
        "--gamma",
        nargs="?",
        type=float,
        default=None,
        help="Kernel coefficient for rbf, poly and sigmoid kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-tol",
        "--tolerance",
        type=float,
        default=0.0,
        help="Convergence tolerance for arpack. If 0, optimal value will be chosen by arpack",
    )
    parser.add_argument(
        "-mi",
        "--max_iter",
        nargs="?",
        type=int,
        default=None,
        help="Maximum number of iterations for arpack",
    )
    parser.add_argument(
        "-d",
        "--degree",
        type=int,
        default=3,
        help="Degree for poly kernels. Ignored by other kernels",
    )
    parser.add_argument(
        "-cf",
        "--coef0",
        type=float,
        default=1.0,
        help="Independent term in poly and sigmoid kernels",
    )
    parser.add_argument(
        "-e",
        "--eigen_solver",
        type=str.lower,
        default="auto",
        choices=["auto", "dense", "arpack"],
        help="Choose the type of eigen solver.",
    )
    parser.add_argument(
        "-o", "--outfile", help="Base name for output file (no extension)."
    )
    return parser


def _select_columns(parser, args):
    """Translate --columns/--column_indices into the ``c`` value for read_columns.

    Returns a list of ints for the index-based modes, the raw comma-separated
    string for the header-based modes, and None when all columns are used.
    Invalid or missing selections are reported through ``parser.error`` (usage
    message on stderr, exit status 2) instead of an unhandled traceback.
    """
    needs_selection = args.columns in _INDEX_MODES or args.columns in _HEADER_MODES
    if not needs_selection:
        return None

    if args.column_indices is None:
        parser.error("--column_indices is required when --columns is " + args.columns)

    if args.columns in _HEADER_MODES:
        # Passed through untouched; the original code did the same.
        return args.column_indices

    try:
        return [int(token) for token in args.column_indices.split(",")]
    except ValueError:
        parser.error(
            "--column_indices must be a comma-separated list of integers "
            "when --columns is " + args.columns
        )


def _pca_settings(args):
    """Return ``(estimator_class, params)`` for the requested PCA flavour.

    Only the parameters relevant to the chosen flavour, solver and kernel are
    included, so the estimator's own defaults apply to everything else.
    """
    params = {"n_components": args.number}

    if args.pca_type == "classical":
        params.update({"svd_solver": args.svd_solver, "whiten": args.whiten})
        if args.svd_solver == "arpack":
            params.update({"tol": args.tolerance})
        return PCA, params

    if args.pca_type == "incremental":
        params.update({"batch_size": args.batch_size, "whiten": args.whiten})
        return IncrementalPCA, params

    if args.pca_type == "kernel":
        params.update(
            {
                "kernel": args.kernel,
                "eigen_solver": args.eigen_solver,
                "gamma": args.gamma,
            }
        )
        if args.kernel == "poly":
            params.update({"degree": args.degree, "coef0": args.coef0})
        elif args.kernel == "sigmoid":
            params.update({"coef0": args.coef0})
        if args.eigen_solver == "arpack":
            params.update({"tol": args.tolerance, "max_iter": args.max_iter})
        return KernelPCA, params

    # Unreachable through the CLI because argparse restricts --pca_type.
    raise ValueError("Unsupported PCA type: %s" % args.pca_type)


def main():
    """Run PCA on a tab-separated input file and save the projected data.

    Parses the command line, loads the selected columns with
    ``galaxy_ml.utils.read_columns``, fits the chosen estimator (PCA,
    IncrementalPCA or KernelPCA) and writes the transformed matrix to
    ``--outfile`` as tab-separated values with four decimal places.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Validate the column selection before any file is read.
    usecols = _select_columns(parser, args)

    header = "infer" if args.header else None

    pca_input = read_columns(
        f=args.infile,
        c=usecols,
        c_option=args.columns,
        sep="\t",
        header=header,
        parse_dates=True,
        encoding=None,
        index_col=None,
    )

    estimator_class, pca_params = _pca_settings(args)

    if args.pca_type == "kernel" and args.kernel == "precomputed":
        # A precomputed kernel expects a square sample-by-sample matrix, so
        # the linear Gram matrix of the input is used.
        pca_input = np.dot(pca_input, pca_input.T)

    print(pca_params)
    pca = estimator_class()
    pca.set_params(**pca_params)
    pca_output = pca.fit_transform(pca_input)
    np.savetxt(fname=args.outfile, X=pca_output, fmt="%.4f", delimiter="\t")


if __name__ == "__main__":
    main()
