Mercurial > repos > bgruening > sklearn_generalized_linear
comparison model_validations.py @ 24:d51beacdc6c6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
| author | bgruening | 
|---|---|
| date | Tue, 14 May 2019 17:57:01 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 23:a146c2dd1ac3 | 24:d51beacdc6c6 | 
|---|---|
| 1 """ | |
| 2 class | |
| 3 ----- | |
| 4 OrderedKFold | |
| 5 RepeatedOrderedKFold | |
| 6 | |
| 7 | |
| 8 function | |
| 9 -------- | |
| 10 train_test_split | |
| 11 """ | |
| 12 | |
| 13 import numpy as np | |
| 14 import warnings | |
| 15 | |
| 16 from itertools import chain | |
| 17 from math import ceil, floor | |
| 18 from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit, | |
| 19 StratifiedShuffleSplit) | |
| 20 from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits | |
| 21 from sklearn.utils import check_random_state, indexable, safe_indexing | |
| 22 from sklearn.utils.validation import _num_samples, check_array | |
| 23 | |
| 24 | |
| 25 def _validate_shuffle_split(n_samples, test_size, train_size, | |
| 26 default_test_size=None): | |
| 27 """ | |
| 28 Validation helper to check if the test/test sizes are meaningful wrt to the | |
| 29 size of the data (n_samples) | |
| 30 """ | |
| 31 if test_size is None and train_size is None: | |
| 32 test_size = default_test_size | |
| 33 | |
| 34 test_size_type = np.asarray(test_size).dtype.kind | |
| 35 train_size_type = np.asarray(train_size).dtype.kind | |
| 36 | |
| 37 if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0) | |
| 38 or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)): | |
| 39 raise ValueError('test_size={0} should be either positive and smaller' | |
| 40 ' than the number of samples {1} or a float in the ' | |
| 41 '(0, 1) range'.format(test_size, n_samples)) | |
| 42 | |
| 43 if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0) | |
| 44 or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)): | |
| 45 raise ValueError('train_size={0} should be either positive and smaller' | |
| 46 ' than the number of samples {1} or a float in the ' | |
| 47 '(0, 1) range'.format(train_size, n_samples)) | |
| 48 | |
| 49 if train_size is not None and train_size_type not in ('i', 'f'): | |
| 50 raise ValueError("Invalid value for train_size: {}".format(train_size)) | |
| 51 if test_size is not None and test_size_type not in ('i', 'f'): | |
| 52 raise ValueError("Invalid value for test_size: {}".format(test_size)) | |
| 53 | |
| 54 if (train_size_type == 'f' and test_size_type == 'f' and | |
| 55 train_size + test_size > 1): | |
| 56 raise ValueError( | |
| 57 'The sum of test_size and train_size = {}, should be in the (0, 1)' | |
| 58 ' range. Reduce test_size and/or train_size.' | |
| 59 .format(train_size + test_size)) | |
| 60 | |
| 61 if test_size_type == 'f': | |
| 62 n_test = ceil(test_size * n_samples) | |
| 63 elif test_size_type == 'i': | |
| 64 n_test = float(test_size) | |
| 65 | |
| 66 if train_size_type == 'f': | |
| 67 n_train = floor(train_size * n_samples) | |
| 68 elif train_size_type == 'i': | |
| 69 n_train = float(train_size) | |
| 70 | |
| 71 if train_size is None: | |
| 72 n_train = n_samples - n_test | |
| 73 elif test_size is None: | |
| 74 n_test = n_samples - n_train | |
| 75 | |
| 76 if n_train + n_test > n_samples: | |
| 77 raise ValueError('The sum of train_size and test_size = %d, ' | |
| 78 'should be smaller than the number of ' | |
| 79 'samples %d. Reduce test_size and/or ' | |
| 80 'train_size.' % (n_train + n_test, n_samples)) | |
| 81 | |
| 82 n_train, n_test = int(n_train), int(n_test) | |
| 83 | |
| 84 if n_train == 0: | |
| 85 raise ValueError( | |
| 86 'With n_samples={}, test_size={} and train_size={}, the ' | |
| 87 'resulting train set will be empty. Adjust any of the ' | |
| 88 'aforementioned parameters.'.format(n_samples, test_size, | |
| 89 train_size) | |
| 90 ) | |
| 91 | |
| 92 return n_train, n_test | |
| 93 | |
| 94 | |
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no array is given, if `labels` is missing for a 'stratified' or
        'group' shuffle, if `shuffle` is not a supported value, or if the
        requested sizes do not fit the data.
    TypeError
        If an unknown keyword option is passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    # Anything still left in `options` is an unsupported keyword.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        # For a group split the sizes are expressed in numbers of distinct
        # groups rather than numbers of rows.
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        # No shuffling: the leading rows train, the rows right after test.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        # Mirror the 'group' branch: stratification has no class labels to
        # work with if `labels` is missing, so fail with a clear message.
        if labels is None:
            raise ValueError("When shuffle='stratified', "
                             "labels should not be None!")

        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    # Interleave the results as [a_train, a_test, b_train, b_test, ...].
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))
| 202 | |
| 203 | |
class OrderedKFold(_BaseKFold):
    """
    Split into K fold based on ordered target value

    Samples are ranked by their target value and every ``n_splits``-th
    sample of that ranking is assigned to the same test fold.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle: bool
        When True, samples are shuffled inside each consecutive window of
        ``n_splits`` positions of the ranking before folds are drawn.
    random_state: None or int
        Seed (or None) handed to ``check_random_state`` when shuffling.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        # shuffle / random_state are passed by keyword so the call keeps
        # working if the base class declares them keyword-only.
        super(OrderedKFold, self).__init__(
            n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups=None):
        """Yield the test indices of each fold.

        Parameters
        ----------
        X : indexable
            Only its number of samples is used.
        y : array-like
            Target values defining the ordering. Required.
        groups : ignored

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            # The ordering is derived from y; without it there is nothing
            # to sort by.
            raise ValueError("The 'y' parameter should not be None.")

        n_samples = _num_samples(X)
        n_splits = int(self.n_splits)
        sorted_index = np.argsort(np.asarray(y))

        if self.shuffle:
            rng = check_random_state(self.random_state)
            # Shuffle in place within each full window of n_splits ranked
            # samples, then within the remaining tail.
            n_full = (n_samples // n_splits) * n_splits
            for start in range(0, n_full, n_splits):
                rng.shuffle(sorted_index[start:start + n_splits])
            rng.shuffle(sorted_index[n_full:])

        # Fold i takes positions i, i + n_splits, i + 2 * n_splits, ...
        for i in range(n_splits):
            yield sorted_index[i:n_samples:n_splits]
| 235 | |
| 236 | |
class RepeatedOrderedKFold(_RepeatedSplits):
    """ Repeated OrderedKFold runs multiple times with different randomization.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=5
        Number of times cross-validator to be repeated.

    random_state: int, RandomState instance or None. Optional
    """
    def __init__(self, n_splits=5, n_repeats=5, random_state=None):
        # n_repeats / random_state are passed by keyword so the call keeps
        # working if the base class declares them keyword-only; n_splits is
        # forwarded as a constructor argument for OrderedKFold.
        super(RepeatedOrderedKFold, self).__init__(
            OrderedKFold, n_repeats=n_repeats, random_state=random_state,
            n_splits=n_splits)
