Source code for mriqc.classifier.cv

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: oesteban
# @Date:   2015-11-19 16:44:27
# @Last Modified by:   oesteban
# @Last Modified time: 2017-03-07 19:39:20

"""

============================================================
:mod:`mriqc.classifier.cv` -- MRIQC Cross-validation Helpers
============================================================


"""
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd

from mriqc import __version__, logging
from .data import read_iqms, read_dataset, zscore_dataset
from .sklearn_extension import ModelAndGridSearchCV, RobustGridSearchCV, nested_fit_and_score

from sklearn.base import is_classifier, clone
from sklearn.metrics.scorer import check_scoring
from sklearn.model_selection import LeavePGroupsOut, StratifiedKFold
from sklearn.model_selection._split import check_cv

from builtins import object, str

LOG = logging.getLogger('mriqc.classifier')

DEFAULT_TEST_PARAMETERS = {
    'svc_linear': [{'C': [0.1, 1]}],
}
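
# The grid above maps a model name to a list of parameter-grid dictionaries,
# in the format consumed by ModelAndGridSearchCV. Note that CVHelper.fit()
# below looks up self.param['rfc'], so when using CVHelper a ``param``
# argument providing an 'rfc' grid is expected, e.g. (hypothetical values):
#
#     param = {'rfc': [{'n_estimators': [100, 200], 'max_depth': [None, 10]}]}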

EXCLUDE_COLUMNS = ['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']


class CVHelperBase(object):
    def __init__(self, X, Y, param=None, n_jobs=-1, site_label='site',
                 rate_label='rater_1'):
        # Initialize some values
        self.param = DEFAULT_TEST_PARAMETERS.copy()
        if param is not None:
            self.param = param

        self.n_jobs = n_jobs
        self._rate_column = rate_label
        self._site_column = site_label
        self._Xtrain, self._ftnames = read_dataset(X, Y, rate_label=rate_label)
        self.sites = list(set(self._Xtrain[site_label].values.ravel()))

    @property
    def ftnames(self):
        return self._ftnames

    @property
    def rate_column(self):
        return self._rate_column

    def fit(self):
        raise NotImplementedError

    def predict_dataset(self, data, out_file=None):
        raise NotImplementedError

    def predict(self, data):
        raise NotImplementedError

    def get_groups(self):
        groups = list(self._Xtrain[[self._site_column]].values.ravel())
        group_names = list(set(groups))
        return [group_names.index(g) for g in groups]
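
    # For example, if self._Xtrain['site'] holds ['siteA', 'siteA', 'siteB'],
    # get_groups() returns integer labels such as [0, 0, 1] (the exact indices
    # depend on set ordering), suitable as the ``groups`` argument of
    # sklearn's group-aware splitters such as LeavePGroupsOut.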

    def _generate_sample(self, zscored=False):
        from sklearn.utils import indexable
        X = self._Xtr_zs.copy() if zscored else self._Xtrain.copy()
        sample_x = np.array([tuple(x) for x in X[self._ftnames].values])
        labels_y = X[[self._rate_column]].values.ravel()
        return indexable(sample_x, labels_y, self.get_groups())


class NestedCVHelper(CVHelperBase):
    def __init__(self, X, Y, param=None, n_jobs=-1, site_label='site',
                 rate_label='rater_1', task_id=None):
        super(NestedCVHelper, self).__init__(
            X, Y, param=param, n_jobs=n_jobs, site_label=site_label,
            rate_label=rate_label)

        self._Xtr_zs = zscore_dataset(
            self._Xtrain, njobs=n_jobs,
            excl_columns=[rate_label] + EXCLUDE_COLUMNS)

        self._models = []
        self._best_clf = {}
        self._best_model = {}
        self._cv_inner = {'type': 'kfold', 'n_splits': 10}
        self._cv_outer = None
        self._cv_scores_df = None
        self._task_id = task_id

    @property
    def cv_scores_df(self):
        return self._cv_scores_df

    @property
    def cv_inner(self):
        return self._cv_inner

    @cv_inner.setter
    def cv_inner(self, value):
        self._cv_inner = value

    @property
    def cv_outer(self):
        return self._cv_outer

    @cv_outer.setter
    def cv_outer(self, value):
        self._cv_outer = value

    @property
    def best_clf(self):
        return self._best_clf

    @property
    def best_model(self):
        return self._best_model

    def fit(self):
        LOG.info('Start fitting ...')

        gs_cv_params = {'n_jobs': self.n_jobs, 'cv': _cv_build(self.cv_inner),
                        'verbose': 0}

        zscore_cv_auc = []
        zscore_cv_acc = []
        split_id = 0
        for dozs in [False, True]:
            LOG.info('Generate %sz-scored sample ...', '' if dozs else 'non ')
            X, y, groups = self._generate_sample(zscored=dozs)

            # The inner CV loop is a grid search on clf_params
            LOG.info('Creating ModelAndGridSearchCV')
            inner_cv = ModelAndGridSearchCV(self.param, **gs_cv_params)

            # Some of sklearn's validations
            scoring = check_scoring(inner_cv, scoring='roc_auc')
            cv_outer = check_cv(_cv_build(self.cv_outer), y,
                                classifier=is_classifier(inner_cv))

            # Outer CV loop
            outer_cv_scores = []
            outer_cv_acc = []
            LOG.info('Starting nested cross-validation ...')
            for train, test in list(cv_outer.split(X, y, groups)):
                # Find the groups in the train set, in case inner CV is LOSO
                fit_params = None
                if self.cv_inner.get('type') == 'loso':
                    train_groups = [groups[i] for i in train]
                    fit_params = {'groups': train_groups}

                result = nested_fit_and_score(
                    clone(inner_cv), X, y, scoring, train, test,
                    fit_params=fit_params, verbose=1)

                # Skip this fold if the test group had no positive cases
                if result is None:
                    continue

                score, clf = result
                test_group = list(set(groups[i] for i in test))[0]
                self._models.append({
                    'zscored': int(dozs),
                    'outer_split_id': split_id,
                    'left-out-sites': self.sites[test_group],
                    'best_model': clf.best_model_,
                    'best_params': clf.best_params_,
                    'best_score': clf.best_score_,
                    'best_index': clf.best_index_,
                    'cv_results': clf.cv_results_,
                    'cv_scores': score['test']['roc_auc'],
                    'cv_accuracy': score['test']['accuracy'],
                    'cv_params': clf.cv_results_['params'],
                    'cv_auc_means': clf.cv_results_['mean_test_score'],
                    'cv_splits': {
                        'split%03d' % i: clf.cv_results_['split%d_test_score' % i]
                        for i in list(range(clf.n_splits_))},
                })

                # Store the outer loop scores
                if score['test']['roc_auc'] is not None:
                    outer_cv_scores.append(score['test']['roc_auc'])
                outer_cv_acc.append(score['test']['accuracy'])
                split_id += 1

            LOG.info('Outer CV loop finished, roc_auc=%f (+/-%f), '
                     'accuracy=%f (+/-%f)',
                     np.mean(outer_cv_scores), 2 * np.std(outer_cv_scores),
                     np.mean(outer_cv_acc), 2 * np.std(outer_cv_acc))

            zscore_cv_auc.append(outer_cv_scores)
            zscore_cv_acc.append(outer_cv_acc)

        # Select the best performing model across both samples
        best_inner_loops = [model['best_score'] for model in self._models]
        best_idx = np.argmax(best_inner_loops)
        self._best_model = self._models[best_idx]
        LOG.info('Inner CV [%d models compared] - best model %s-%szs, '
                 'score=%f, params=%s',
                 len(best_inner_loops) * len(self._models[0]['cv_params']),
                 self._best_model['best_model'][0],
                 'n' if not self._best_model['zscored'] else '',
                 self._best_model['best_score'], self._best_model['best_params'])

        # Write out evaluation result
        best_zs = 1 if self._best_model['zscored'] else 0
        LOG.info('CV - estimated performance: roc_auc=%f (+/-%f), '
                 'accuracy=%f (+/-%f)',
                 np.mean(zscore_cv_auc[best_zs]), 2 * np.std(zscore_cv_auc[best_zs]),
                 np.mean(zscore_cv_acc[best_zs]), 2 * np.std(zscore_cv_acc[best_zs]))
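
    # Minimal usage sketch (file names hypothetical; X and Y are the feature
    # and rating tables accepted by read_dataset):
    #
    #     helper = NestedCVHelper('x_features.csv', 'y_ratings.csv', n_jobs=4)
    #     helper.cv_outer = {'type': 'loso'}  # leave-one-site-out outer loop
    #     helper.fit()
    #     inner_df = helper.get_inner_cv_scores()
    #     outer_df = helper.get_outer_cv_scores()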

    def get_inner_cv_scores(self):
        # Compose a dataframe object
        columns = ['split_id', 'zscored', 'clf', 'mean_auc', 'params']
        cvdict = {col: [] for col in columns}
        cvdict.update({key: [] for key in self._models[0]['cv_splits'].keys()})

        for model in self._models:
            for i, param in enumerate(model['cv_params']):
                cvdict['clf'] += [param[0]]
                cvdict['split_id'] += [model['outer_split_id']]
                cvdict['zscored'] += [int(model['zscored'])]
                cvdict['params'] += [param[1]]
                cvdict['mean_auc'] += [model['cv_auc_means'][i]]
                for key, val in list(model['cv_splits'].items()):
                    cvdict[key] += [val[i]]

        # massage columns
        if self._task_id is not None:
            cvdict['task_id'] = [self._task_id] * len(cvdict['clf'])
            columns.insert(0, 'task_id')

        self._cv_scores_df = pd.DataFrame(cvdict)[columns]
        return self._cv_scores_df

    def get_outer_cv_scores(self):
        # Compose a dataframe object
        columns = ['split_id', 'site', 'zscored', 'auc', 'acc']
        cvdict = {col: [] for col in columns}

        for model in self._models:
            cvdict['zscored'] += [int(model['zscored'])]
            cvdict['split_id'] += [model['outer_split_id']]
            cvdict['site'] += [model['left-out-sites']]
            cvdict['auc'] += [model['cv_scores']]
            cvdict['acc'] += [model['cv_accuracy']]

        if self._task_id is not None:
            cvdict['task_id'] = [self._task_id] * len(cvdict['split_id'])
            columns.insert(0, 'task_id')

        return pd.DataFrame(cvdict)[columns]


class CVHelper(CVHelperBase):
    def __init__(self, X=None, Y=None, load_clf=None, param=None, n_jobs=-1,
                 site_label='site', rate_label='rater_1', zscored=False):
        if (X is None or Y is None) and load_clf is None:
            raise RuntimeError('Either load_clf or X & Y should be supplied')

        self._estimator = None
        self._Xtest = None
        self._zscored = zscored
        self._pickled = False
        self._rate_column = rate_label

        if load_clf is not None:
            self.n_jobs = n_jobs
            self.load(load_clf)
            self._ftnames = getattr(self._estimator, '_ftnames')
        else:
            super(CVHelper, self).__init__(
                X, Y, param=param, n_jobs=n_jobs, site_label=site_label,
                rate_label=rate_label)

            if zscored:
                self._Xtrain = zscore_dataset(
                    self._Xtrain, njobs=n_jobs,
                    excl_columns=[rate_label] + EXCLUDE_COLUMNS)

    @property
    def estimator(self):
        return self._estimator

    @property
    def Xtest(self):
        return self._Xtest

    def setXtest(self, X, Y):
        self._Xtest, _ = read_dataset(X, Y, rate_label=self._rate_column)
        if self._zscored:
            self._Xtest = zscore_dataset(
                self._Xtest, njobs=self.n_jobs,
                excl_columns=[self._rate_column] + EXCLUDE_COLUMNS)

    def fit(self):
        from sklearn.ensemble import RandomForestClassifier as RFC
        if self._pickled:
            LOG.info('Classifier was loaded from file, cancelling fitting.')
            return

        LOG.info('Start fitting ...')

        estimator = RFC()
        grid = RobustGridSearchCV(
            estimator, self.param['rfc'], error_score=0.5, refit=True,
            scoring=check_scoring(estimator, scoring='roc_auc'),
            n_jobs=self.n_jobs, cv=LeavePGroupsOut(n_groups=1), verbose=0)

        X, y, groups = self._generate_sample()
        self._estimator = grid.fit(X, y, groups=groups)

        LOG.info('Model selection - best parameters (roc_auc=%f) %s',
                 grid.best_score_, grid.best_params_)
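
    # Usage sketch, assuming ``param`` supplies the 'rfc' key that fit()
    # reads (file names and grid values hypothetical):
    #
    #     cvhelper = CVHelper('x_features.csv', 'y_ratings.csv',
    #                         param={'rfc': [{'n_estimators': [100, 200]}]})
    #     cvhelper.fit()
    #     cvhelper.save('classifier.pklz')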

    def save(self, filehandler, compress=3):
        """
        Pickle the estimator, adding the feature names
        http://scikit-learn.org/stable/modules/model_persistence.html

        """
        from sklearn.externals.joblib import dump as savepkl

        # Store ftnames
        setattr(self._estimator, '_ftnames', self._ftnames)
        savepkl(self._estimator, filehandler, compress=compress)

    def load(self, filehandler):
        """
        Unpickle the estimator, restoring the feature names
        http://scikit-learn.org/stable/modules/model_persistence.html

        """
        from sklearn.externals.joblib import load as loadpkl
        self._estimator = loadpkl(filehandler)
        self._ftnames = getattr(self._estimator, '_ftnames')
        self._pickled = True

    def predict(self, datapoints):
        return self.estimator.predict(datapoints).astype(int)

    def predict_dataset(self, data, out_file=None):
        _xeval, _, bidts = read_iqms(data)
        sample_x = np.array([tuple(x) for x in _xeval[self._ftnames].values])
        pred = _xeval[bidts].copy()
        pred['prediction'] = self.predict(sample_x)

        if out_file is not None:
            pred[bidts + ['prediction']].to_csv(out_file, index=False)

        return pred

    def evaluate(self, scoring='accuracy'):
        from sklearn.model_selection._validation import _score

        sample_x = np.array([tuple(x) for x in self._Xtest[self._ftnames].values])
        return _score(self._estimator, sample_x,
                      self._Xtest.rate.values.ravel().tolist(),
                      check_scoring(self._estimator, scoring=scoring))
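
    # Held-out evaluation sketch (paths hypothetical):
    #
    #     cvhelper.setXtest('x_test.csv', 'y_test.csv')
    #     accuracy = cvhelper.evaluate(scoring='accuracy')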


def _cv_build(cv_scheme):
    LOG.debug('Building CV scheme: %s', str(cv_scheme))
    if cv_scheme is None:
        return None

    if cv_scheme.get('type', '') == 'kfold':
        nsplits = cv_scheme.get('n_splits', 6)
        return StratifiedKFold(n_splits=nsplits, shuffle=True)

    if cv_scheme.get('type', '') == 'loso':
        return LeavePGroupsOut(n_groups=1)

    raise RuntimeError('Unknown CV scheme (%s)' % str(cv_scheme))
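
# Examples of the ``cv_scheme`` dictionaries accepted by _cv_build():
#
#     _cv_build({'type': 'kfold', 'n_splits': 10})  # stratified k-fold
#     _cv_build({'type': 'loso'})                   # leave-one-site-out
#     _cv_build(None)                               # let check_cv pick a default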