
"""
:mod:`operalib.ridge` implements Operator-Valued Kernel ridge
regression.
"""
# Author: Romain Brault <romain.brault@telecom-paristech.fr> with help from
#         the scikit-learn community.
#         Maxime Sangnier <maxime.sangnier@gmail.com>
# License: MIT

from scipy.optimize import minimize
from scipy.sparse.linalg import LinearOperator
from numpy import (reshape, eye, zeros, empty, dot, all, isnan, diag, number,
                   issubdtype)
from scipy.linalg import solve

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics.pairwise import rbf_kernel

from control import dlyap

from .metrics import first_periodic_kernel
from .kernels import DecomposableKernel, RBFCurlFreeKernel
from .risk import OVKRidgeRisk
from .signal import get_period

# When adding a new kernel, update this table and the _get_kernel_map method
PAIRWISE_KERNEL_FUNCTIONS = {
    'DGauss': DecomposableKernel,
    'DPeriodic': DecomposableKernel,
    'CurlF': RBFCurlFreeKernel}
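
# For instance, the entry 'DGauss' maps to the DecomposableKernel class;
# _get_kernel_map below instantiates it with a scalar kernel and its
# parameters. A minimal sketch mirroring that call:
#
#     >>> kernel = PAIRWISE_KERNEL_FUNCTIONS['DGauss'](
#     ...     A=eye(2), scalar_kernel=rbf_kernel,
#     ...     scalar_kernel_params={'gamma': 0.1})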


def _graph_Laplacian(similarities):
    """Return the graph Laplacian D - W of a similarity matrix W."""
    return diag(similarities.sum(axis=1)) - similarities
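
# A minimal sketch of what _graph_Laplacian returns, on a hypothetical
# 2 x 2 similarity matrix:
#
#     >>> from numpy import array
#     >>> W = array([[0., 1.], [1., 0.]])
#     >>> _graph_Laplacian(W)  # diag(row sums) - W
#     array([[ 1., -1.],
#            [-1.,  1.]])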


class _SemisupLinop:
    """Weighting operators for the semi-supervised OVK ridge problem."""

    def __init__(self, lbda2, is_sup, L, p):
        self.lbda2 = lbda2
        self.is_sup = is_sup.ravel()
        self.L = L
        self.p = p
        self.ns = is_sup.shape[0] - L.shape[0]  # number of supervised points
        self.ls = L.shape[0]  # number of unsupervised points

    def _dot_U(self, vec):
        # Identity on supervised rows; lbda2 * Laplacian on the others.
        mat = vec.reshape((self.ns + self.ls, self.p))
        res = empty((self.ns + self.ls, self.p))
        res[self.is_sup, :] = mat[self.is_sup, :]
        res[~self.is_sup, :] = self.lbda2 * dot(self.L, mat[~self.is_sup, :])

        return res.ravel()

    def _dot_JT(self, vec):
        # Identity on supervised rows; zero out the unsupervised rows.
        mat = vec.reshape((self.ns + self.ls, self.p))
        res = empty((self.ns + self.ls, self.p))
        res[self.is_sup, :] = mat[self.is_sup, :]
        res[~self.is_sup, :] = 0

        return res.ravel()

    def gen(self):
        shape_U = ((self.ns + self.ls) * self.p, (self.ns + self.ls) * self.p)
        shape_JT = ((self.ns + self.ls) * self.p, (self.ns + self.ls) * self.p)

        # return U, JT
        return (LinearOperator(shape_U,
                               dtype=self.L.dtype,
                               matvec=lambda b: self._dot_U(b),
                               rmatvec=lambda b: self._dot_U(b)),
                LinearOperator(shape_JT,
                               dtype=self.L.dtype,
                               matvec=lambda b: self._dot_JT(b),
                               rmatvec=lambda b: self._dot_JT(b)))
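
# A minimal usage sketch of _SemisupLinop on hypothetical toy data: two
# supervised points, one unsupervised point, scalar outputs (p=1). U acts
# as the identity on supervised rows and as lbda2 * L on the others; JT
# zeroes the unsupervised rows.
#
#     >>> from numpy import array, ones
#     >>> is_sup = array([True, True, False])
#     >>> L = array([[0.]])  # Laplacian of the unsupervised points
#     >>> U, JT = _SemisupLinop(0.1, is_sup, L, 1).gen()
#     >>> U.matvec(ones(3))
#     array([1., 1., 0.])
#     >>> JT.matvec(ones(3))
#     array([1., 1., 0.])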


class OVKDecomposableRidge(BaseEstimator, RegressorMixin):
    r"""Operator-Valued kernel ridge regression.

    Operator-Valued kernel ridge regression (OVKRR) combines ridge
    regression (linear least squares with l2-norm regularization) with the
    (OV)kernel trick. It learns a linear function in the space induced by
    the respective kernel and the data. For non-linear kernels, this
    corresponds to a non-linear function in the original space.

    This is a simplified version of OVKRidge handling only the decomposable
    kernel. This allows the optimality condition to be rewritten as a
    Sylvester system of equations, reducing the complexity to

    .. math::
        \mathcal{O}(n^3) + \mathcal{O}(p^3)

    where n is the number of training points and p the number of outputs.

    Optimization problem solved for learning:

    .. math::
        \min_{h \in \mathcal H}~ \frac{\lambda}{2} \|h\|_{\mathcal H}^2 +
        \frac{1}{np} \sum_{i=1}^n \|y_i - h(x_i)\|_{\mathcal Y}^2

    Attributes
    ----------
    dual_coefs_ : array, shape = [n_samples x n_targets]
        Weight vector(s) in kernel space.

    linop_ : callable
        Callable which associates to the training points X the Gram matrix
        (the Gram matrix being a LinearOperator).

    A_ : array, shape = [n_targets, n_targets]
        Set when the linear operator used by the decomposable kernel is
        default or None.

    period_ : float
        Set when the period used by the First periodic kernel is 'autocorr'.

    See also
    --------
    ovk.DecomposableKernel
        Decomposable Operator-Valued Kernel.
    ovk.OVKRidge
        Operator-Valued kernel ridge regression with iterative solver.
    sklearn.Ridge
        Linear ridge regression.
    sklearn.KernelRidge
        Kernel ridge regression.
    sklearn.SVR
        Support Vector Regression implemented using libsvm.

    Examples
    --------
    >>> import operalib as ovk
    >>> import numpy as np
    >>> n_samples, n_features, n_targets = 10, 5, 5
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples, n_targets)
    >>> X = rng.randn(n_samples, n_features)
    >>> clf = ovk.OVKDecomposableRidge('Gauss', lbda=1.0)
    >>> clf.fit(X, y)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    OVKDecomposableRidge(...)
    """

    def __init__(self, input_kernel='Gauss', A=None, lbda=1e-5,
                 gamma=None, theta=0.7, period='autocorr',
                 autocorr_params=None):
        """Initialize OVK ridge regression model.

        Parameters
        ----------
        input_kernel : {string, callable}, default='Gauss'
            Scalar kernel mapping used internally. A callable should accept
            two arguments, and should return a LinearOperator.

        A : {LinearOperator, array-like, sparse matrix}, default=None
            Linear operator used by the decomposable kernel. If None, the
            identity matrix of size y.shape[1] is used when fitting.

        lbda : {float}, default=1e-5
            Small positive values of lbda improve the conditioning of the
            problem and reduce the variance of the estimates. Lbda
            corresponds to ``(2*C)^-1`` in other linear models such as
            LogisticRegression or LinearSVC.

        gamma : {float}, default=None
            Gamma parameter for the Decomposable Gaussian kernel. Ignored
            by other kernels.

        theta : {float}, default=.7
            Theta parameter for the Decomposable First Periodic kernel.
            Ignored by other kernels.

        period : {float}, default='autocorr'
            Period parameter for the First periodic kernel. If 'autocorr',
            autocorrelation methods are used to determine the period when
            fitting.

        autocorr_params : {mapping of string to any}
            Additional parameters (keyword arguments) for the period
            detection for periodic kernels. If None, parameter choice is
            left to the period detection method.
""" self.input_kernel = input_kernel self.A = A self.lbda = lbda self.theta = theta self.period = period self.autocorr_params = autocorr_params self.gamma = gamma def _validate_params(self): # check on self.ovkernel is performed in method __get_kernel if self.lbda < 0: raise ValueError('lbda must be positive') if self.gamma is not None: if self.gamma < 0: raise ValueError('sigma must be positive or default (None)') if self.theta < 0: raise ValueError('theta must be positive') if isinstance(self.period, (int, float)): if self.period < 0: raise ValueError('period must be positive') def _default_decomposable_op(self, y): if self.A is not None: return self.A elif y.ndim == 2: return eye(y.shape[1]) else: return eye(1) def _default_period(self, X, y): if self.period is 'autocorr': autocorr_params = self.autocorr_params or {} return get_period(X, y, **autocorr_params) elif isinstance(self.period, (int, float)): return self.period else: raise ValueError('period must be a positive number or a valid ' 'string') def _get_kernel_map(self, X, y): # When adding a new kernel, update this table and the _get_kernel_map # method if callable(self.input_kernel): ovkernel = self.input_kernel elif type(self.input_kernel) is str: # 1) check string and assign the right parameters if self.input_kernel == 'Gauss': self.A_ = self._default_decomposable_op(y) kernel_params = {'A': self.A_, 'scalar_kernel': rbf_kernel, 'scalar_kernel_params': {'gamma': self.gamma}} elif self.input_kernel == 'Periodic': self.A_ = self._default_decomposable_op(y) self.period_ = self._default_period(X, y) kernel_params = {'A': self.A_, 'scalar_kernel': first_periodic_kernel, 'scalar_kernel_params': {'gamma': self.theta, 'period': self.period_}, } else: raise NotImplementedError('unsupported kernel') # 2) Uses lookup table to select the right kernel from string ovkernel = PAIRWISE_KERNEL_FUNCTIONS['D' + self.input_kernel]( **kernel_params) else: raise NotImplementedError('unsupported kernel') return ovkernel(X)

    def fit(self, X, y):
        """Fit OVK ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        y = check_array(y, force_all_finite=False, accept_sparse=False,
                        ensure_2d=False)
        if y.ndim == 1:
            y = check_array(y, force_all_finite=True, accept_sparse=False,
                            ensure_2d=False)
        self._validate_params()

        self.linop_ = self._get_kernel_map(X, y)
        Gram = self.linop_._Gram(X)

        if self.lbda > 0:
            self.dual_coefs_ = dlyap(-Gram / self.lbda, self.linop_.A,
                                     y / self.lbda)
        else:
            # TODO: Check A is invertible!!
            self.dual_coefs_ = solve(Gram, y)
        return self
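
    # A minimal numerical check (hypothetical data) of the closed-form step
    # in fit, assuming python-control's dlyap(A, Q, C) solves the Sylvester
    # equation A X Q - X + C = 0: dlyap(-Gram / lbda, A, y / lbda) then
    # returns the dual coefficients C solving the optimality condition
    # Gram @ C @ A + lbda * C = y.
    #
    #     >>> import numpy as np
    #     >>> rng = np.random.RandomState(0)
    #     >>> Gram = rng.rand(4, 4)
    #     >>> A = np.eye(2)
    #     >>> y, lbda = rng.rand(4, 2), 0.5
    #     >>> C = dlyap(-Gram / lbda, A, y / lbda)
    #     >>> np.allclose(Gram.dot(C).dot(A) + lbda * C, y)
    #     True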

    def _decision_function(self, X):
        pred = dot(dot(self.linop_._Gram(X), self.dual_coefs_),
                   self.linop_.A)
        return reshape(pred, (X.shape[0], self.linop_.p)) \
            if self.linop_.p > 1 else pred

    def predict(self, X):
        """Predict using the OVK ridge model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        C : {array}, shape = [n_samples] or [n_samples, n_targets]
            Returns predicted values.
        """
        check_is_fitted(self, ['dual_coefs_', 'linop_'], all_or_any=all)
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        return self._decision_function(X)


class OVKRidge(BaseEstimator, RegressorMixin):
    r"""Operator-Valued kernel ridge regression.

    Operator-Valued kernel ridge regression (OVKRR) combines ridge
    regression (linear least squares with l2-norm regularization) with the
    (OV)kernel trick. It learns a linear function in the space induced by
    the respective kernel and the data. For non-linear kernels, this
    corresponds to a non-linear function in the original space.

    Let n be the number of training points and p the number of outputs.
    This algorithm has a per-iteration complexity of

    .. math::
        \mathcal{O}(pn^2) + \mathcal{O}(p^2n)

    for the decomposable kernel. Hence, when the number of outputs is
    large, the solver OVKDecomposableRidge should be faster.

    The form of the model learned by OVKRR is identical to support vector
    regression (SVR). However, different loss functions are used: OVKRR
    uses squared error loss while support vector regression uses
    epsilon-insensitive loss, both combined with l2 regularization. In
    contrast to SVR, fitting an OVKRR model can be done in closed form and
    is typically faster for medium-sized datasets. On the other hand, the
    learned model is non-sparse and thus slower at prediction time than
    SVR, which learns a sparse model for epsilon > 0.

    Optimization problem solved for learning:

    .. math::
        \min_{h \in \mathcal H}~ \frac{\lambda}{2} \|h\|_{\mathcal H}^2 +
        \frac{1}{np} \sum_{i=1}^n \|y_i - h(x_i)\|_{\mathcal Y}^2 +
        \frac{\lambda_m}{2} \sum_{i,j=1}^n W_{ij} \|h(x_i) -
        h(x_j)\|_{\mathcal Y}^2

    Attributes
    ----------
    dual_coefs_ : array, shape = [n_samples x n_targets]
        Weight vector(s) in kernel space.

    linop_ : callable
        Callable which associates to the training points X the Gram matrix
        (the Gram matrix being a LinearOperator).

    A_ : array, shape = [n_targets, n_targets]
        Set when the linear operator used by the decomposable kernel is
        default or None.

    L_ : array, shape = [n_samples_miss, n_samples_miss]
        Graph Laplacian of data with missing targets (semi-supervised
        learning).

    period_ : float
        Set when the period used by the First periodic kernel is 'autocorr'.

    solver_res_ : any
        Raw results returned by the solver.

    References
    ----------
    * Micchelli, Charles A., and Massimiliano Pontil.
      "On learning vector-valued functions." Neural Computation
      17.1 (2005): 177-204.

    * Alvarez, Mauricio A., Lorenzo Rosasco, and Neil D. Lawrence.
      "Kernels for vector-valued functions: A review." arXiv preprint
      arXiv:1106.6251 (2011).

    * Brouard, Celine, Florence d'Alche-Buc, and Marie Szafranski.
      "Input Output Kernel Regression." HAL preprint hal-01216708 (2015).

    See also
    --------
    sklearn.Ridge
        Linear ridge regression.
    sklearn.KernelRidge
        Kernel ridge regression.
    sklearn.SVR
        Support Vector Regression implemented using libsvm.

    Examples
    --------
    >>> import operalib as ovk
    >>> import numpy as np
    >>> n_samples, n_features, n_targets = 10, 5, 5
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples, n_targets)
    >>> X = rng.randn(n_samples, n_features)
    >>> clf = ovk.OVKRidge('DGauss', lbda=1.0)
    >>> clf.fit(X, y)  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    OVKRidge(...)
    """

    def __init__(self, ovkernel='DGauss', lbda=1e-5, lbda_m=0., A=None,
                 gamma=None, gamma_m=None, theta=0.7, period='autocorr',
                 autocorr_params=None, solver='L-BFGS-B',
                 solver_params=None):
        """Initialize OVK ridge regression model.

        Parameters
        ----------
        ovkernel : {string, callable}, default='DGauss'
            Kernel mapping used internally. A callable should accept two
            arguments, and should return a LinearOperator.
        lbda : {float}, default=1e-5
            Small positive values of lbda improve the conditioning of the
            problem and reduce the variance of the estimates. Lbda
            corresponds to ``(2*C)^-1`` in other linear models such as
            LogisticRegression or LinearSVC.

        lbda_m : {float}, default=0.
            Regularization parameter for the quadratic penalty on data with
            missing targets.

        A : {LinearOperator, array-like, sparse matrix}, default=None
            Linear operator used by the decomposable kernel. If None, the
            identity matrix of size y.shape[1] is used when fitting.

        gamma : {float}, default=None
            Gamma parameter for the Decomposable Gaussian kernel. Ignored
            by other kernels.

        gamma_m : {float}, default=None
            Gamma parameter for the graph Laplacian inducing the penalty on
            data with missing targets.

        theta : {float}, default=.7
            Theta parameter for the Decomposable First Periodic kernel.
            Ignored by other kernels.

        period : {float}, default='autocorr'
            Period parameter for the First periodic kernel. If 'autocorr',
            autocorrelation methods are used to determine the period when
            fitting.

        solver : {string, callable}, default='L-BFGS-B'
            Solver able to find the minimum of the ridge problem; passed as
            the ``method`` argument of scipy.optimize.minimize.

        autocorr_params : {mapping of string to any}
            Additional parameters (keyword arguments) for the period
            detection for periodic kernels. If None, parameter choice is
            left to the period detection method.

        solver_params : {mapping of string to any}, optional
            Additional parameters (keyword arguments) for the solver,
            passed as the ``options`` argument of scipy.optimize.minimize.
        """
        self.ovkernel = ovkernel
        self.lbda = lbda
        self.lbda_m = lbda_m
        self.A = A
        self.gamma = gamma
        self.gamma_m = gamma_m
        self.theta = theta
        self.period = period
        self.autocorr_params = autocorr_params
        self.solver = solver
        self.solver_params = solver_params

    def _validate_params(self):
        # the check on self.ovkernel is performed in _get_kernel_map
        if self.lbda < 0:
            raise ValueError('lbda must be positive')
        if self.lbda_m < 0:
            raise ValueError('lbda_m must be positive')
        # Checking whether A is symmetric positive semi-definite would be
        # expensive, so no check is performed here.
        if self.gamma is not None:
            if self.gamma < 0:
                raise ValueError('gamma must be positive or default (None)')
        if self.theta < 0:
            raise ValueError('theta must be positive')
        if isinstance(self.period, (int, float)):
            if self.period < 0:
                raise ValueError('period must be positive')
        # TODO: add supported solver check

    def _default_decomposable_op(self, y):
        # TODO: check NaN values (semi-sup learning)
        if self.A is not None:
            return self.A
        elif y.ndim == 2:
            return eye(y.shape[1])
        else:
            return eye(1)

    def _default_period(self, X, y):
        if self.period == 'autocorr':
            autocorr_params = self.autocorr_params or {}
            return get_period(X, y, **autocorr_params)
        elif isinstance(self.period, (int, float)):
            return self.period
        else:
            raise ValueError('period must be a positive number or a valid '
                             'string')

    def _get_kernel_map(self, X, y):
        # When adding a new kernel, update the PAIRWISE_KERNEL_FUNCTIONS
        # table and this method
        if callable(self.ovkernel):
            ovkernel = self.ovkernel
        elif isinstance(self.ovkernel, str):
            # 1) check string and assign the right parameters
            if self.ovkernel == 'DGauss':
                self.A_ = self._default_decomposable_op(y)
                kernel_params = {'A': self.A_,
                                 'scalar_kernel': rbf_kernel,
                                 'scalar_kernel_params': {'gamma':
                                                          self.gamma}}
            elif self.ovkernel == 'DPeriodic':
                self.A_ = self._default_decomposable_op(y)
                self.period_ = self._default_period(X, y)
                kernel_params = {'A': self.A_,
                                 'scalar_kernel': first_periodic_kernel,
                                 'scalar_kernel_params': {'gamma': self.theta,
                                                          'period':
                                                          self.period_}}
            elif self.ovkernel == 'CurlF':
                kernel_params = {'gamma': self.gamma}
            else:
                raise NotImplementedError('unsupported kernel')
            # 2) Use the lookup table to select the right kernel from the
            # string
            ovkernel = PAIRWISE_KERNEL_FUNCTIONS[self.ovkernel](
                **kernel_params)
        else:
            raise NotImplementedError('unsupported kernel')
        return ovkernel(X)

    def _decision_function(self, X):
        pred = self.linop_(X) * self.dual_coefs_
        return reshape(pred, (X.shape[0], self.linop_.p)) \
            if self.linop_.p > 1 else pred

    def fit(self, X, y):
        """Fit OVK ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data.

        y : {array-like}, shape = [n_samples] or [n_samples, n_targets]
            Target values. numpy.NaN for missing targets (semi-supervised
            learning).

        Returns
        -------
        self : returns an instance of self.
        """
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        y = check_array(y, force_all_finite=False, accept_sparse=False,
                        ensure_2d=False)
        if y.ndim == 1:
            y = check_array(y, force_all_finite=True, accept_sparse=False,
                            ensure_2d=False)
        self._validate_params()
        solver_params = self.solver_params or {}

        self.linop_ = self._get_kernel_map(X, y)
        Gram = self.linop_(X)
        risk = OVKRidgeRisk(self.lbda)

        if not issubdtype(y.dtype, number):
            raise ValueError("Unknown label type: %r" % y.dtype)
        if y.ndim > 1:
            is_sup = ~all(isnan(y), axis=1)
        else:
            is_sup = ~isnan(y)

        if sum(~is_sup) > 0:
            self.L_ = _graph_Laplacian(rbf_kernel(X[~is_sup, :],
                                                  gamma=self.gamma_m))
        else:
            self.L_ = empty((0, 0))

        p = y.shape[1] if y.ndim > 1 else 1
        weight, zeronan = _SemisupLinop(self.lbda_m, is_sup,
                                        self.L_, p).gen()

        self.solver_res_ = minimize(risk.functional_grad_val,
                                    zeros(Gram.shape[1]),
                                    args=(y.ravel(), Gram, weight, zeronan),
                                    method=self.solver,
                                    jac=True,
                                    options=solver_params)
        self.dual_coefs_ = self.solver_res_.x
        return self
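
    # A minimal sketch of the solver call pattern used in fit, assuming a
    # toy objective that, like OVKRidgeRisk.functional_grad_val, returns a
    # (value, gradient) pair so that jac=True applies:
    #
    #     >>> import numpy as np
    #     >>> from scipy.optimize import minimize
    #     >>> fun = lambda c: (0.5 * c.dot(c), c)  # value, gradient
    #     >>> res = minimize(fun, np.ones(3), jac=True, method='L-BFGS-B')
    #     >>> res.success
    #     True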

    def predict(self, X):
        """Predict using the OVK ridge model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Samples.

        Returns
        -------
        C : {array}, shape = [n_samples] or [n_samples, n_targets]
            Returns predicted values.
        """
        check_is_fitted(self, ['dual_coefs_', 'linop_'], all_or_any=all)
        X = check_array(X, force_all_finite=True, accept_sparse=False,
                        ensure_2d=True)
        return self._decision_function(X)
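

# A minimal semi-supervised usage sketch (hypothetical data): target rows
# set to NaN are treated as missing and penalized through the graph
# Laplacian term weighted by lbda_m.
#
#     >>> import numpy as np
#     >>> import operalib as ovk
#     >>> rng = np.random.RandomState(0)
#     >>> X, y = rng.randn(10, 5), rng.randn(10, 2)
#     >>> y[5:, :] = np.nan  # the last five targets are missing
#     >>> model = ovk.OVKRidge('DGauss', lbda=1., lbda_m=0.1, gamma_m=1.)
#     >>> model.fit(X, y).predict(X).shape
#     (10, 2)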