comparison: feature_selectors.py @ 29:5b8d4d35c605 (draft)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
| author | bgruening |
|---|---|
| date | Tue, 14 May 2019 17:59:56 -0400 |
| parents | |
| children | |
comparison of changesets 28:3d12f87c65ea and 29:5b8d4d35c605:
| 1 """ | |
| 2 DyRFE | |
| 3 DyRFECV | |
| 4 MyPipeline | |
| 5 MyimbPipeline | |
| 6 check_feature_importances | |
| 7 """ | |
| 8 import numpy as np | |
| 9 | |
| 10 from imblearn import under_sampling, over_sampling, combine | |
| 11 from imblearn.pipeline import Pipeline as imbPipeline | |
| 12 from sklearn import (cluster, compose, decomposition, ensemble, | |
| 13 feature_extraction, feature_selection, | |
| 14 gaussian_process, kernel_approximation, | |
| 15 metrics, model_selection, naive_bayes, | |
| 16 neighbors, pipeline, preprocessing, | |
| 17 svm, linear_model, tree, discriminant_analysis) | |
| 18 | |
| 19 from sklearn.base import BaseEstimator | |
| 20 from sklearn.base import MetaEstimatorMixin, clone, is_classifier | |
| 21 from sklearn.feature_selection.rfe import _rfe_single_fit, RFE, RFECV | |
| 22 from sklearn.model_selection import check_cv | |
| 23 from sklearn.metrics.scorer import check_scoring | |
| 24 from sklearn.utils import check_X_y, safe_indexing, safe_sqr | |
| 25 from sklearn.utils._joblib import Parallel, delayed, effective_n_jobs | |
| 26 | |
| 27 | |
class DyRFE(RFE):
    """
    Mainly used with DyRFECV

    Parameters
    ----------
    estimator : object
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.
    n_features_to_select : int or None (default=None)
        The number of features to select. If `None`, half of the features
        are selected.
    step : int, float or list, optional (default=1)
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        If a list, the number of features to remove at each iteration.
        Elimination stops when all steps have been used.
    verbose : int, (default=0)
        Controls verbosity of output.

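    Examples
    --------
    Illustrative sketch only (not part of the original file); it assumes a
    small synthetic dataset from ``sklearn.datasets.make_classification``
    and a linear SVM, and shows a list-valued ``step`` removing 3, then 2,
    then 1 feature(s) per iteration until 3 features remain.

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.svm import SVC
    >>> X, y = make_classification(n_samples=60, n_features=10,
    ...                            n_informative=4, random_state=0)
    >>> selector = DyRFE(SVC(kernel='linear'), n_features_to_select=3,
    ...                  step=[3, 2, 1])
    >>> selector = selector.fit(X, y)
    >>> int(selector.n_features_)
    3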
| 51 """ | |
| 52 def __init__(self, estimator, n_features_to_select=None, step=1, | |
| 53 verbose=0): | |
| 54 super(DyRFE, self).__init__(estimator, n_features_to_select, | |
| 55 step, verbose) | |
| 56 | |
| 57 def _fit(self, X, y, step_score=None): | |
| 58 | |
| 59 if type(self.step) is not list: | |
| 60 return super(DyRFE, self)._fit(X, y, step_score) | |
| 61 | |
| 62 # dynamic step | |
| 63 X, y = check_X_y(X, y, "csc") | |
| 64 # Initialization | |
| 65 n_features = X.shape[1] | |
| 66 if self.n_features_to_select is None: | |
| 67 n_features_to_select = n_features // 2 | |
| 68 else: | |
| 69 n_features_to_select = self.n_features_to_select | |
| 70 | |
| 71 step = [] | |
| 72 for s in self.step: | |
| 73 if 0.0 < s < 1.0: | |
| 74 step.append(int(max(1, s * n_features))) | |
| 75 else: | |
| 76 step.append(int(s)) | |
| 77 if s <= 0: | |
| 78 raise ValueError("Step must be >0") | |
| 79 | |
| 80 support_ = np.ones(n_features, dtype=np.bool) | |
| 81 ranking_ = np.ones(n_features, dtype=np.int) | |
| 82 | |
| 83 if step_score: | |
| 84 self.scores_ = [] | |
| 85 | |
| 86 step_i = 0 | |
| 87 # Elimination | |
| 88 while np.sum(support_) > n_features_to_select and step_i < len(step): | |
| 89 | |
| 90 # if last step is 1, will keep loop | |
| 91 if step_i == len(step) - 1 and step[step_i] != 0: | |
| 92 step.append(step[step_i]) | |
| 93 | |
| 94 # Remaining features | |
| 95 features = np.arange(n_features)[support_] | |
| 96 | |
| 97 # Rank the remaining features | |
| 98 estimator = clone(self.estimator) | |
| 99 if self.verbose > 0: | |
| 100 print("Fitting estimator with %d features." % np.sum(support_)) | |
| 101 | |
| 102 estimator.fit(X[:, features], y) | |
| 103 | |
| 104 # Get coefs | |
| 105 if hasattr(estimator, 'coef_'): | |
| 106 coefs = estimator.coef_ | |
| 107 else: | |
| 108 coefs = getattr(estimator, 'feature_importances_', None) | |
| 109 if coefs is None: | |
| 110 raise RuntimeError('The classifier does not expose ' | |
| 111 '"coef_" or "feature_importances_" ' | |
| 112 'attributes') | |
| 113 | |
| 114 # Get ranks | |
| 115 if coefs.ndim > 1: | |
| 116 ranks = np.argsort(safe_sqr(coefs).sum(axis=0)) | |
| 117 else: | |
| 118 ranks = np.argsort(safe_sqr(coefs)) | |
| 119 | |
| 120 # for sparse case ranks is matrix | |
| 121 ranks = np.ravel(ranks) | |
| 122 | |
| 123 # Eliminate the worse features | |
| 124 threshold =\ | |
| 125 min(step[step_i], np.sum(support_) - n_features_to_select) | |
| 126 | |
| 127 # Compute step score on the previous selection iteration | |
| 128 # because 'estimator' must use features | |
| 129 # that have not been eliminated yet | |
| 130 if step_score: | |
| 131 self.scores_.append(step_score(estimator, features)) | |
| 132 support_[features[ranks][:threshold]] = False | |
| 133 ranking_[np.logical_not(support_)] += 1 | |
| 134 | |
| 135 step_i += 1 | |
| 136 | |
| 137 # Set final attributes | |
| 138 features = np.arange(n_features)[support_] | |
| 139 self.estimator_ = clone(self.estimator) | |
| 140 self.estimator_.fit(X[:, features], y) | |
| 141 | |
| 142 # Compute step score when only n_features_to_select features left | |
| 143 if step_score: | |
| 144 self.scores_.append(step_score(self.estimator_, features)) | |
| 145 self.n_features_ = support_.sum() | |
| 146 self.support_ = support_ | |
| 147 self.ranking_ = ranking_ | |
| 148 | |
| 149 return self | |
| 150 | |
| 151 | |
class DyRFECV(RFECV, MetaEstimatorMixin):
    """
    Compared with RFECV, DyRFECV offers a flexible `step` to eliminate
    features, given as a list, while RFECV supports only a fixed `step`.

    Parameters
    ----------
    estimator : object
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.
    step : int, float or list, optional (default=1)
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        If a list, the number of features to remove at each iteration.
        Elimination stops when all steps have been used.
        Note that the last iteration may remove fewer than ``step`` features in
        order to reach ``min_features_to_select``.
    min_features_to_select : int, (default=1)
        The minimum number of features to be selected. This number of features
        will always be scored, even if the difference between the original
        feature count and ``min_features_to_select`` isn't divisible by
        ``step``.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. If the
        estimator is a classifier or if ``y`` is neither binary nor multiclass,
        :class:`sklearn.model_selection.KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
        .. versionchanged:: 0.20
            ``cv`` default value of None will change from 3-fold to 5-fold
            in v0.22.
    scoring : string, callable or None, optional, (default=None)
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    verbose : int, (default=0)
        Controls verbosity of output.
    n_jobs : int or None, optional (default=None)
        Number of cores to run in parallel while fitting across folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
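
    Examples
    --------
    Illustrative sketch only (not part of the original file); it assumes a
    small synthetic dataset and a linear SVM, and shows cross-validated
    selection driven by a list-valued ``step``.

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.svm import SVC
    >>> X, y = make_classification(n_samples=60, n_features=10,
    ...                            n_informative=4, random_state=0)
    >>> selector = DyRFECV(SVC(kernel='linear'), step=[3, 2, 1],
    ...                    min_features_to_select=2, cv=3)
    >>> selector = selector.fit(X, y)
    >>> selector.support_.shape
    (10,)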
| 205 """ | |
| 206 def __init__(self, estimator, step=1, min_features_to_select=1, cv='warn', | |
| 207 scoring=None, verbose=0, n_jobs=None): | |
| 208 super(DyRFECV, self).__init__( | |
| 209 estimator, step=step, | |
| 210 min_features_to_select=min_features_to_select, | |
| 211 cv=cv, scoring=scoring, verbose=verbose, | |
| 212 n_jobs=n_jobs) | |
| 213 | |
| 214 def fit(self, X, y, groups=None): | |
| 215 """Fit the RFE model and automatically tune the number of selected | |
| 216 features. | |
| 217 Parameters | |
| 218 ---------- | |
| 219 X : {array-like, sparse matrix}, shape = [n_samples, n_features] | |
| 220 Training vector, where `n_samples` is the number of samples and | |
| 221 `n_features` is the total number of features. | |
| 222 y : array-like, shape = [n_samples] | |
| 223 Target values (integers for classification, real numbers for | |
| 224 regression). | |
| 225 groups : array-like, shape = [n_samples], optional | |
| 226 Group labels for the samples used while splitting the dataset into | |
| 227 train/test set. | |
| 228 """ | |
| 229 if type(self.step) is not list: | |
| 230 return super(DyRFECV, self).fit(X, y, groups) | |
| 231 | |
| 232 X, y = check_X_y(X, y, "csr") | |
| 233 | |
| 234 # Initialization | |
| 235 cv = check_cv(self.cv, y, is_classifier(self.estimator)) | |
| 236 scorer = check_scoring(self.estimator, scoring=self.scoring) | |
| 237 n_features = X.shape[1] | |
| 238 | |
| 239 step = [] | |
| 240 for s in self.step: | |
| 241 if 0.0 < s < 1.0: | |
| 242 step.append(int(max(1, s * n_features))) | |
| 243 else: | |
| 244 step.append(int(s)) | |
| 245 if s <= 0: | |
| 246 raise ValueError("Step must be >0") | |
| 247 | |
| 248 # Build an RFE object, which will evaluate and score each possible | |
| 249 # feature count, down to self.min_features_to_select | |
| 250 rfe = DyRFE(estimator=self.estimator, | |
| 251 n_features_to_select=self.min_features_to_select, | |
| 252 step=self.step, verbose=self.verbose) | |
| 253 | |
| 254 # Determine the number of subsets of features by fitting across | |
| 255 # the train folds and choosing the "features_to_select" parameter | |
| 256 # that gives the least averaged error across all folds. | |
| 257 | |
| 258 # Note that joblib raises a non-picklable error for bound methods | |
| 259 # even if n_jobs is set to 1 with the default multiprocessing | |
| 260 # backend. | |
| 261 # This branching is done so that to | |
| 262 # make sure that user code that sets n_jobs to 1 | |
| 263 # and provides bound methods as scorers is not broken with the | |
| 264 # addition of n_jobs parameter in version 0.18. | |
| 265 | |
| 266 if effective_n_jobs(self.n_jobs) == 1: | |
| 267 parallel, func = list, _rfe_single_fit | |
| 268 else: | |
| 269 parallel = Parallel(n_jobs=self.n_jobs) | |
| 270 func = delayed(_rfe_single_fit) | |
| 271 | |
| 272 scores = parallel( | |
| 273 func(rfe, self.estimator, X, y, train, test, scorer) | |
| 274 for train, test in cv.split(X, y, groups)) | |
| 275 | |
| 276 scores = np.sum(scores, axis=0) | |
| 277 diff = int(scores.shape[0]) - len(step) | |
| 278 if diff > 0: | |
| 279 step = np.r_[step, [step[-1]] * diff] | |
| 280 scores_rev = scores[::-1] | |
| 281 argmax_idx = len(scores) - np.argmax(scores_rev) - 1 | |
| 282 n_features_to_select = max( | |
| 283 n_features - sum(step[:argmax_idx]), | |
| 284 self.min_features_to_select) | |
| 285 | |
| 286 # Re-execute an elimination with best_k over the whole set | |
| 287 rfe = DyRFE(estimator=self.estimator, | |
| 288 n_features_to_select=n_features_to_select, step=self.step, | |
| 289 verbose=self.verbose) | |
| 290 | |
| 291 rfe.fit(X, y) | |
| 292 | |
| 293 # Set final attributes | |
| 294 self.support_ = rfe.support_ | |
| 295 self.n_features_ = rfe.n_features_ | |
| 296 self.ranking_ = rfe.ranking_ | |
| 297 self.estimator_ = clone(self.estimator) | |
| 298 self.estimator_.fit(self.transform(X), y) | |
| 299 | |
| 300 # Fixing a normalization error, n is equal to get_n_splits(X, y) - 1 | |
| 301 # here, the scores are normalized by get_n_splits(X, y) | |
| 302 self.grid_scores_ = scores[::-1] / cv.get_n_splits(X, y, groups) | |
| 303 return self | |
| 304 | |
| 305 | |
class MyPipeline(pipeline.Pipeline):
    """
    Extend the sklearn Pipeline object to expose a ``feature_importances_``
    attribute (the ``coef_`` or ``feature_importances_`` of the final
    estimator).
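
    Examples
    --------
    Illustrative sketch only (not part of the original file); it assumes a
    small synthetic dataset and a ``StandardScaler`` + ``LinearSVC`` pipeline,
    and shows that the final estimator's coefficients are exposed as
    ``feature_importances_`` after ``fit``.

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.preprocessing import StandardScaler
    >>> from sklearn.svm import LinearSVC
    >>> X, y = make_classification(n_samples=60, n_features=8, random_state=0)
    >>> pipe = MyPipeline([('scale', StandardScaler()),
    ...                    ('clf', LinearSVC(random_state=0))])
    >>> pipe = pipe.fit(X, y)
    >>> pipe.feature_importances_.shape
    (1, 8)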
| 309 """ | |
| 310 def fit(self, X, y=None, **fit_params): | |
| 311 super(MyPipeline, self).fit(X, y, **fit_params) | |
| 312 estimator = self.steps[-1][-1] | |
| 313 if hasattr(estimator, 'coef_'): | |
| 314 coefs = estimator.coef_ | |
| 315 else: | |
| 316 coefs = getattr(estimator, 'feature_importances_', None) | |
| 317 if coefs is None: | |
| 318 raise RuntimeError('The estimator in the pipeline does not expose ' | |
| 319 '"coef_" or "feature_importances_" ' | |
| 320 'attributes') | |
| 321 self.feature_importances_ = coefs | |
| 322 return self | |
| 323 | |
| 324 | |
class MyimbPipeline(imbPipeline):
    """
    Extend the imbalanced-learn Pipeline object to expose a
    ``feature_importances_`` attribute (the ``coef_`` or
    ``feature_importances_`` of the final estimator).
    """
    def fit(self, X, y=None, **fit_params):
        super(MyimbPipeline, self).fit(X, y, **fit_params)
        estimator = self.steps[-1][-1]
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The estimator in the pipeline does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        self.feature_importances_ = coefs
        return self


def check_feature_importances(estimator):
    """
    For a pipeline object which has no feature_importances_ attribute,
    this function returns the same configured pipeline object with
    the last estimator's feature_importances_ attached.
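
    Examples
    --------
    Illustrative sketch only (not part of the original file); it assumes a
    plain sklearn ``Pipeline`` and shows that the returned object is rebuilt
    as ``MyPipeline``, so ``feature_importances_`` becomes available after
    fitting.

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> pipe = Pipeline([('scale', StandardScaler()),
    ...                  ('clf', RandomForestClassifier(n_estimators=10))])
    >>> wrapped = check_feature_importances(pipe)
    >>> isinstance(wrapped, MyPipeline)
    True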
| 349 """ | |
| 350 if estimator.__class__.__module__ == 'sklearn.pipeline': | |
| 351 pipeline_steps = estimator.get_params()['steps'] | |
| 352 estimator = MyPipeline(pipeline_steps) | |
| 353 elif estimator.__class__.__module__ == 'imblearn.pipeline': | |
| 354 pipeline_steps = estimator.get_params()['steps'] | |
| 355 estimator = MyimbPipeline(pipeline_steps) | |
| 356 else: | |
| 357 return estimator |
