#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: oesteban
# @Date:   2015-11-19 16:44:27
# @Last Modified by:   oesteban
# @Last Modified time: 2017-03-09 17:17:08

"""
=====================
Extensions to sklearn
=====================


Extends sklearn's GridSearchCV to a model search object


"""
from __future__ import absolute_import, division, print_function, unicode_literals
import warnings
import numbers
import time

from functools import partial, reduce
from itertools import product
try:  # Python >= 3.3 (the aliases in ``collections`` were removed in 3.10)
    from collections.abc import Mapping, Sized
except ImportError:  # Python 2
    from collections import Mapping, Sized
import operator
import numpy as np

from sklearn.base import is_classifier, clone
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection._split import check_cv
from sklearn.model_selection._search import (
    BaseSearchCV, check_scoring, indexable,
    Parallel, delayed, defaultdict, rankdata
)

from sklearn.utils.fixes import np_version

from sklearn.model_selection._validation import (
    _score, _num_samples, _index_param_value, _safe_split,
    FitFailedWarning, logger)

from mriqc import logging
from builtins import object, zip

if np_version < (1, 12, 0):
    from sklearn.utils.fixes import MaskedArray
else:
    from numpy.ma import MaskedArray

LOG = logging.getLogger('mriqc.classifier')

def _len(indict):
    """Total number of parameter settings across a list of grids."""
    prod = partial(reduce, operator.mul)
    return sum(prod(len(v) for v in p.values()) if p else 1
               for p in indict)
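# Worked example (illustrative): for the list of grids
# [{'a': [1, 2], 'b': [True, False]}, {}], _len returns 2 * 2 + 1 == 5:
# the product of value counts for each non-empty grid, with an empty grid
# counting as the single default-parameter setting.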

class ModelParameterGrid(object):
    """
    Grid of models and parameters with a discrete number of values for each.

    Can be used to iterate over parameter value combinations with the
    Python built-in function iter.

    Read more in the :ref:`User Guide <search>`.

    Parameters
    ----------
    param_grid : dict of string to sequence, or sequence of such
        The grid to explore, as a dictionary mapping model names to lists
        of parameter grids. Each parameter grid is a dictionary mapping
        estimator parameters to sequences of allowed values.

        An empty parameter grid signifies default parameters.

        A sequence of grids per model is useful to avoid exploring
        parameter combinations that make no sense or have no effect.
        See the examples below.

    Examples
    --------
    >>> from mriqc.classifier.model_selection import ModelParameterGrid
    >>> param_grid = {'model1': [{'a': [1, 2], 'b': [True, False]}],
    ...               'model2': [{'a': [0]}]}
    >>> grid = ModelParameterGrid(param_grid)
    >>> len(grid) == 5
    True
    >>> points = list(grid)
    >>> ('model2', {'a': 0}) in points
    True
    >>> ('model1', {'a': 1, 'b': False}) in points
    True
    >>> all(grid[i] == points[i] for i in range(len(grid)))
    True

    See also
    --------
    :class:`ModelAndGridSearchCV`:
        Uses :class:`ModelParameterGrid` to perform a full parallelized
        parameter search.

    """

    def __init__(self, param_grid):
        if isinstance(param_grid, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]
        self.param_grid = param_grid

    def __iter__(self):
        """Iterate over the points in the grid.

        Returns
        -------
        params : iterator over (string, dict) tuples
            Yields pairs of model name and a dictionary mapping each
            estimator parameter to one of its allowed values.

        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = list(p.items())
            if not items:
                yield {}
            else:
                for estimator, grid_list in items:
                    for grid in grid_list:
                        if not grid:
                            # Empty grid: the model's default parameters
                            # (keeps __iter__ consistent with __getitem__)
                            yield (estimator, {})
                            continue
                        grid_points = sorted(list(grid.items()))
                        keys, values = zip(*grid_points)
                        for v in product(*values):
                            params = dict(zip(keys, v))
                            yield (estimator, params)

    def __len__(self):
        """Number of points on the grid."""
        # Delegate to the module-level _len() helper for each model's grids
        return sum(_len(points) for p in self.param_grid
                   for estim, points in list(p.items()))

    def __getitem__(self, ind):
        """Get the parameters that would be ``ind``th in iteration

        Parameters
        ----------
        ind : int
            The iteration index

        Returns
        -------
        params : (string, dict) tuple
            Equal to list(self)[ind]

        """
        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            for estimator, e_grid_list in list(sub_grid.items()):
                for e_grid in e_grid_list:
                    if not e_grid:
                        if ind == 0:
                            return (estimator, {})
                        else:
                            ind -= 1
                        continue

                    # Reverse so most frequent cycling parameter comes first
                    keys, values_lists = zip(*sorted(e_grid.items())[::-1])
                    sizes = [len(v_list) for v_list in values_lists]
                    total = np.prod(sizes)

                    if ind >= total:
                        # Try the next grid
                        ind -= total
                    else:
                        out = {}
                        for key, v_list, n in zip(keys, values_lists, sizes):
                            ind, offset = divmod(ind, n)
                            out[key] = v_list[offset]
                        return (estimator, out)

        raise IndexError('ModelParameterGrid index out of range')
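
# A usage sketch (illustrative only): in this module the outer keys are the
# model names understood by ``_clf_build`` at the bottom of the file; the
# hyper-parameter values below are arbitrary examples.
#
#     grid = ModelParameterGrid({
#         'svc_rbf': [{'C': [0.1, 1.0], 'gamma': [0.01, 0.1]}],
#         'rfc': [{'n_estimators': [50, 100]}],
#     })
#     len(grid)         # 2 * 2 + 2 == 6 candidate (model, parameters) pairs
#     next(iter(grid))  # e.g. ('svc_rbf', {'C': 0.1, 'gamma': 0.01})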

class RobustGridSearchCV(GridSearchCV):
    """
    A :class:`~sklearn.model_selection.GridSearchCV` that evaluates each
    fold with :func:`_robust_fit_and_score`, so that test splits holding a
    single class are scored at chance level instead of raising.
    """

    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting, performing the search over parameters."""
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_robust_fit_and_score)(
            clone(base_estimator), X, y, self.scorer_,
            train, test, self.verbose, parameters,
            fit_params=self.fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True, return_parameters=True,
            error_score=self.error_score)
          for parameters in parameter_iterable
          for train, test in cv_iter)

        # if one chooses to see train scores, "out" will contain them
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average(
                (array - array_means[:, np.newaxis]) ** 2,
                axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE: test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all-masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
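
# Illustrative use (a sketch, assuming the older scikit-learn API (circa
# 0.18) that this module imports, with arbitrary example values; ``X`` and
# ``y`` are a feature matrix and binary labels):
#
#     from sklearn.svm import SVC
#     search = RobustGridSearchCV(SVC(), param_grid={'C': [0.1, 1.0]},
#                                 scoring='roc_auc', cv=5)
#     search.fit(X, y)
#     search.best_params_, search.cv_results_['mean_test_score']
#
# The difference from the parent GridSearchCV is that folds are evaluated
# with ``_robust_fit_and_score`` (defined below), which scores a test split
# containing a single class at chance level (0.5) instead of raising.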

class ModelAndGridSearchCV(BaseSearchCV):
    """
    Adds model selection to GridSearchCV: the parameter grid is keyed by
    model name, and the best ``(model, parameters)`` pair is exposed as
    ``best_model_``.
    """

    def __init__(self, param_grid, scoring=None, fit_params=None,
                 n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise',
                 return_train_score=True):
        super(ModelAndGridSearchCV, self).__init__(
            estimator=None, scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)
        self.param_grid = param_grid
        self.best_model_ = None
        # _check_param_grid(param_grid)

    def fit(self, X, y=None, groups=None):
        """
        Run fit with all sets of parameters.
        """
        return self._fit(X, y, groups, ModelParameterGrid(self.param_grid))

    def _fit(self, X, y, groups, parameter_iterable):
        """Actual fitting, performing the search over parameters."""
        X, y, groups = indexable(X, y, groups)
        cv = check_cv(self.cv, y, classifier=True)
        n_splits = cv.get_n_splits(X, y, groups)
        if self.verbose > 0 and isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            LOG.info("Fitting %d folds for each of %d candidates, totalling"
                     " %d fits", n_splits, n_candidates,
                     n_candidates * n_splits)

        pre_dispatch = self.pre_dispatch
        cv_iter = list(cv.split(X, y, groups))
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_model_fit_and_score)(
            estimator, X, y, self.scoring, train, test,
            self.verbose, parameters, fit_params=self.fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True, return_parameters=True,
            error_score=self.error_score)
          for estimator, parameters in parameter_iterable
          for train, test in cv_iter)

        # if one chooses to see train scores, "out" will contain them
        if self.return_train_score:
            (train_scores, test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)
        else:
            (test_scores, test_sample_counts,
             fit_time, score_time, parameters) = zip(*out)

        candidate_params = parameters[::n_splits]
        n_candidates = len(candidate_params)

        results = dict()

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average(
                (array - array_means[:, np.newaxis]) ** 2,
                axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        # Compute the (weighted) mean and std for test scores alone
        # NOTE: test_sample_counts (weights) remain the same for all candidates
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)

        _store('test_score', test_scores, splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            _store('train_score', train_scores, splits=True)
        _store('fit_time', fit_time)
        _store('score_time', score_time)

        best_index = np.flatnonzero(results["rank_test_score"] == 1)[0]
        best_parameters = candidate_params[best_index][1]

        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            _, param_values = params
            for name, value in param_values.items():
                # An all-masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)

        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        self.cv_results_ = results
        self.best_index_ = best_index
        self.n_splits_ = n_splits
        self.best_model_ = candidate_params[best_index]

        if self.refit:
            # build the best estimator from its name and refit on all data
            best_estimator = _clf_build(self.best_model_[0])
            best_estimator.set_params(**best_parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
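
# Illustrative use (a sketch; the grid keys must be model names that
# ``_clf_build`` below can construct, the parameter values are arbitrary
# examples, and ``X``/``y`` are a feature matrix and labels):
#
#     search = ModelAndGridSearchCV(
#         param_grid={'svc_linear': [{'C': [0.1, 1.0, 10.0]}],
#                     'rfc': [{'n_estimators': [50, 100]}]},
#         scoring='roc_auc', cv=5)
#     search.fit(X, y)
#     search.best_model_  # e.g. ('rfc', {'n_estimators': 100})
#
# A runnable version of this sketch is provided as a smoke test at the
# bottom of the module.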

def _model_fit_and_score(estimator_str, X, y, scorer, train, test, verbose,
                         parameters, fit_params, return_train_score=False,
                         return_parameters=False, return_n_test_samples=False,
                         return_times=False, error_score='raise'):
    """Build the estimator named ``estimator_str`` and fit/score one CV split."""
    if verbose > 1:
        msg = '[CV model=%s]' % estimator_str.upper()
        if parameters is not None:
            msg += ' %s' % (', '.join('%s=%s' % (k, v)
                                      for k, v in parameters.items()))
        LOG.info("%s %s", msg, (89 - len(msg)) * '.')

    estimator = _clf_build(estimator_str)

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time
        scorer = check_scoring(estimator, scoring=scorer)
        test_score = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info(end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append((estimator_str, parameters))
    return ret

def nested_fit_and_score(
        estimator, X, y, scorer, train, test, verbose=1,
        parameters=None, fit_params=None, return_train_score=False,
        return_times=False, error_score='raise'):
    """Fit ``estimator`` on one CV split and report ROC-AUC and accuracy."""
    from sklearn.externals.joblib.logger import short_format_time

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    if verbose > 1:
        LOG.info('CV iteration: Xtrain=%d, Ytrain=%d/%d -- Xtest=%d, Ytest=%d/%d.',
                 len(X_train), len(X_train) - sum(y_train), sum(y_train),
                 len(X_test), len(X_test) - sum(y_test), sum(y_test))

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            acc_score = error_score  # keep accuracy defined for logging below
            if return_train_score:
                train_score = error_score
            LOG.warning("Classifier fit failed. The score on this train-test"
                        " partition for these parameters will be set to %f. "
                        "Details: \n%r", error_score, e)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time

        test_score = None
        score_time = 0.0
        if len(set(y_test)) > 1:
            test_score = _score(estimator, X_test, y_test, scorer)
            score_time = time.time() - start_time - fit_time
        else:
            LOG.warning('Test set has only one class; scoring has been '
                        'skipped in this loop.')

        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

        acc_score = _score(estimator, X_test, y_test,
                           check_scoring(estimator, scoring='accuracy'))

    if verbose > 0:
        total_time = score_time + fit_time
        if test_score is not None:
            LOG.info('Iteration took %s, score=%f, accuracy=%f.',
                     short_format_time(total_time), test_score, acc_score)
        else:
            LOG.info('Iteration took %s, score=None, accuracy=%f.',
                     short_format_time(total_time), acc_score)

    ret = {
        'test': {'roc_auc': test_score, 'accuracy': acc_score}
    }

    if return_train_score:
        ret['train'] = {'roc_auc': train_score}
    if return_times:
        ret['times'] = [fit_time, score_time]

    return ret, estimator
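
# Illustrative call (a sketch; ``train``/``test`` index arrays can come from
# any scikit-learn splitter, and the estimator and scoring choices here are
# arbitrary examples):
#
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.model_selection import StratifiedKFold
#
#     clf = RandomForestClassifier()
#     for train, test in StratifiedKFold(n_splits=5).split(X, y):
#         scores, fitted = nested_fit_and_score(
#             clone(clf), X, y, check_scoring(clf, scoring='roc_auc'),
#             train, test, return_times=True)
#         scores['test']  # {'roc_auc': ..., 'accuracy': ...}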

def _robust_fit_and_score(estimator, X, y, scorer, train, test, verbose,
                          parameters, fit_params, return_train_score=False,
                          return_parameters=False, return_n_test_samples=False,
                          return_times=False, error_score=0.5):
    """Fit/score one CV split, scoring single-class test splits at chance (0.5)."""
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")
    else:
        fit_time = time.time() - start_time
        if len(set(y_test)) == 1:
            # Single-class test split: report chance level instead of failing
            test_score = 0.5
        else:
            test_score = _score(estimator, X_test, y_test, scorer)
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = _score(estimator, X_train, y_train, scorer)

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret


def _clf_build(clf_type):
    """Instantiate one of the supported classifiers from its short name."""
    from sklearn import svm
    from sklearn.ensemble import RandomForestClassifier as RFC

    if clf_type == 'svc_linear':
        return svm.LinearSVC(C=1)
    elif clf_type == 'svc_rbf':
        return svm.SVC(C=1)
    elif clf_type == 'rfc':
        return RFC()
    raise ValueError('Unknown classifier type "%s"' % clf_type)
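

if __name__ == '__main__':
    # Minimal smoke test (an illustrative addition, not part of the original
    # module). It assumes a scikit-learn version that still exposes the
    # private helpers imported at the top of this file, and uses synthetic
    # data in place of real IQMs.
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    search = ModelAndGridSearchCV(
        {'svc_linear': [{'C': [0.1, 1.0]}],
         'rfc': [{'n_estimators': [10, 50]}]},
        scoring='roc_auc', cv=3)
    search.fit(X, y)
    print('Best model:', search.best_model_)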