"""Collection of classess to perform regression."""
from typing import Union, Optional, List
import numpy as np
from numpy.linalg import pinv
from sklearn.linear_model import Lasso as skLasso
from scipy.linalg import lstsq
from clease.data_normalizer import DataNormalizer
__all__ = ("LinearRegression", "Tikhonov", "Lasso")
class LinearRegression:
    """Ordinary least-squares (OLS) fitting scheme."""

    def __init__(self) -> None:
        # Optional weight matrix used by subclasses; ``None`` means unweighted.
        self._weight_matrix = None
        # Singular values with magnitude below this are treated as zero
        # when inverting in :meth:`precision_matrix`.
        self.tol = 1e-8

    @property
    def weight_matrix(self):
        """Weight matrix of the fit (or ``None`` if no weighting is used)."""
        return self._weight_matrix

    def _ensure_weight_matrix_consistency(self, data: np.ndarray) -> None:
        """Raise an error if the dimensions of the
        weight matrix is not consistent.

        :param data: y-values in the fit
        """
        matrix = self._weight_matrix
        if matrix is None:
            return
        if matrix.shape[1] != len(data):
            raise ValueError(
                f"The provided weight matrix needs to have "
                f"dimension {len(data)}x{len(data)}, "
                f"{matrix.shape} given"
            )

    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit a linear model by performing ordinary least squares

        y = Xc

        :param X: Design matrix (NxM)
        :param y: Data points (vector of length N)
        """
        self._ensure_weight_matrix_consistency(y)
        # Only the solution vector is needed; residues/rank/singular values
        # returned by lstsq are discarded.
        solution, *_ = lstsq(X, y)
        return solution

    def precision_matrix(self, X: np.ndarray) -> np.ndarray:
        """Return the pseudo-inverse of X^T X, computed via SVD.

        Singular values smaller than ``self.tol`` are zeroed out instead of
        inverted, which regularizes near-singular design matrices.

        :param X: Design matrix (NxM)
        """
        _, singular, vt = np.linalg.svd(X, full_matrices=False)
        inv_sq = np.zeros_like(singular)
        keep = np.abs(singular) > self.tol
        inv_sq[keep] = 1.0 / singular[keep] ** 2
        # V diag(inv_sq) V^T, written with broadcasting instead of np.diag.
        return (vt.T * inv_sq).dot(vt)

    def get_instance_array(self) -> List[object]:
        """Return a list with a single default instance of this scheme."""
        # pylint: disable=no-self-use
        return [LinearRegression()]

    def is_scalar(self):
        """OLS has no scalar regularization parameter."""
        # pylint: disable=no-self-use
        return False

    def get_scalar_parameter(self):  # pragma: no cover
        # pylint: disable=no-self-use
        raise ValueError("Fitting scheme is not described by a scalar parameter!")

    @property
    def support_fast_loocv(self):
        """OLS supports the fast leave-one-out CV shortcut."""
        return True
class Tikhonov(LinearRegression):
    """Ridge regularization.

    :param alpha: regularization term

        - float: A single regularization coefficient is used for all features.
          Tikhonov matrix is T = alpha * I (I = identity matrix).

        - 1D array: Regularization coefficient is defined for each feature.
          Tikhonov matrix is T = diag(alpha) (the alpha values are
          put on the diagonal).
          The length of array should match the number of features.

        - 2D array: Full Tikhonov matrix supplied by a user.
          The dimensions of the matrix should be M * M where M is the
          number of features.

    :param penalize_bias_term: If True, a scalar ``alpha`` also penalizes the
        bias (first) coefficient. Ignored when ``alpha`` is an array.
    :param normalize: If True each feature will be normalized before fitting
    """

    def __init__(
        self,
        alpha: Union[float, np.ndarray] = 1e-5,
        penalize_bias_term: bool = False,
        normalize: bool = True,
    ) -> None:
        super().__init__()
        self.alpha = alpha
        self.penalize_bias_term = penalize_bias_term
        self.normalize = normalize

    def _get_tikhonov_matrix(self, num_clusters: int) -> np.ndarray:
        """Build the Tikhonov matrix T for ``num_clusters`` features.

        :param num_clusters: number of features (columns of the design matrix)
        :raises ValueError: if ``alpha`` is an array with more than 2 dimensions
        """
        if isinstance(self.alpha, np.ndarray):
            if len(self.alpha.shape) == 1:
                tikhonov = np.diag(self.alpha)
            elif len(self.alpha.shape) == 2:
                tikhonov = self.alpha
            else:
                raise ValueError("Matrix have to have dimension 1 or 2")
        else:
            # Alpha is a floating point number
            tikhonov = np.identity(num_clusters)
            if not self.penalize_bias_term:
                # Leave the bias (first) coefficient unpenalized.
                tikhonov[0, 0] = 0.0
            # sqrt(alpha) because the penalty enters the normal
            # equations as T^T T.
            tikhonov *= np.sqrt(self.alpha)
        return tikhonov

    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit coefficients based on Ridge regularization.

        :param X: Design matrix (NxM). When ``normalize=True`` the first
            column must be a bias column of ones.
        :param y: Data points (vector of length N)
        :raises ValueError: if ``normalize=True`` and the first column of
            ``X`` is not all ones
        """
        self._ensure_weight_matrix_consistency(y)

        if self.weight_matrix is None:
            W = np.ones(len(y))
        else:
            W = np.diag(self.weight_matrix)

        X_fit = X
        y_fit = y
        if self.normalize:
            if np.any(np.abs(X[:, 0] - 1.0) > 1e-16):
                # Fixed duplicated words in the original message ("to a to a").
                msg = "Tikhonov: When normalize=True. The first column in X "
                msg += "should correspond to a bias term. Therefore, all "
                msg += f"entries should be 1.\nGot:\n{X[:, 0]}\n"
                raise ValueError(msg)
            normalizer = DataNormalizer()
            # Fit on normalized feature columns; the bias column is dropped
            # here and its coefficient is reconstructed after the fit.
            X_fit, y_fit = normalizer.normalize(X[:, 1:], y)

        precision = self.precision_matrix(X_fit)
        coeff = precision.dot(X_fit.T.dot(W * y_fit))

        if self.normalize:
            # Map coefficients back to the original scale and prepend the
            # reconstructed bias coefficient.
            coeff_with_bias = np.zeros(len(coeff) + 1)
            coeff_with_bias[1:] = normalizer.convert(coeff)
            coeff_with_bias[0] = normalizer.bias(coeff)
            coeff = coeff_with_bias
        return coeff

    def precision_matrix(self, X: np.ndarray) -> np.ndarray:
        """Calculate the precision matrix (X^T W X + T^T T)^+.

        :param X: Design matrix (NxM)
        :raises ValueError: if the Tikhonov matrix is not MxM
        """
        num_features = X.shape[1]
        tikhonov = self._get_tikhonov_matrix(num_features)
        if tikhonov.shape != (num_features, num_features):
            raise ValueError(
                "The dimensions of Tikhonov matrix do not match the number of clusters!"
            )
        W = self.weight_matrix
        if W is None:
            W = np.eye(X.shape[0])
        precision = pinv(X.T.dot(W.dot(X)) + tikhonov.T.dot(tikhonov))
        return precision

    @staticmethod
    def get_instance_array(
        alpha_min: float,
        alpha_max: float,
        num_alpha: int = 10,
        scale: Optional[str] = "log",
    ) -> List[object]:
        """Return Tikhonov instances with alpha on a log (default) or linear grid.

        :param alpha_min: smallest alpha value
        :param alpha_max: largest alpha value
        :param num_alpha: number of alpha values
        :param scale: "log" for logarithmic spacing, anything else for linear
        """
        # pylint: disable=arguments-differ
        if scale == "log":
            alpha = np.logspace(
                np.log10(alpha_min), np.log10(alpha_max), int(num_alpha), endpoint=True
            )
        else:
            alpha = np.linspace(alpha_min, alpha_max, int(num_alpha), endpoint=True)
        return [Tikhonov(alpha=a) for a in alpha]

    def is_scalar(self) -> bool:
        """True when the regularization is described by a single float."""
        return isinstance(self.alpha, float)

    def get_scalar_parameter(self) -> float:
        """Return alpha if scalar; otherwise raise via the base class."""
        if self.is_scalar():
            return self.alpha
        return super().get_scalar_parameter()
class Lasso(LinearRegression):
    """LASSO regularization.

    :param alpha: regularization coefficient
    :param max_iter: (int) Maximum number of iterations.
    """

    def __init__(self, alpha: float = 1e-5, max_iter: int = int(1e6)) -> None:
        super().__init__()
        self.alpha = alpha
        self.max_iter = max_iter

    @property
    def max_iter(self) -> int:
        """Maximum number of iterations passed to the sklearn solver."""
        return self._max_iter

    @max_iter.setter
    def max_iter(self, value) -> None:
        # sklearn Lasso class must have integer max_iter values.
        self._max_iter = int(value)

    def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Fit coefficients based on LASSO regularization.

        :param X: Design matrix (NxM)
        :param y: Data points (vector of length N)
        """
        # fit_intercept=False: the bias term is expected to be a column in X.
        lasso = skLasso(alpha=self.alpha, fit_intercept=False, copy_X=True, max_iter=self.max_iter)
        lasso.fit(X, y)
        return lasso.coef_

    @property
    def weight_matrix(self):
        # Bug fix: ``LinearRegression.weight_matrix`` accessed on the class is
        # a ``property`` object, which is not callable; the original
        # ``LinearRegression.weight_matrix(self)`` raised TypeError. Delegate
        # to the base getter explicitly instead.
        # pylint: disable=no-member
        return LinearRegression.weight_matrix.fget(self)

    @staticmethod
    def get_instance_array(
        alpha_min: float, alpha_max: float, num_alpha: int = 10, scale: str = "log"
    ) -> List[object]:
        """Return Lasso instances with alpha on a log (default) or linear grid.

        :param alpha_min: smallest alpha value
        :param alpha_max: largest alpha value
        :param num_alpha: number of alpha values
        :param scale: "log" for logarithmic spacing, anything else for linear
        """
        # pylint: disable=arguments-differ
        if scale == "log":
            alpha = np.logspace(
                np.log10(alpha_min), np.log10(alpha_max), int(num_alpha), endpoint=True
            )
        else:
            alpha = np.linspace(alpha_min, alpha_max, int(num_alpha), endpoint=True)
        return [Lasso(alpha=a) for a in alpha]

    def is_scalar(self):
        """LASSO is always described by the scalar ``alpha``."""
        return True

    def get_scalar_parameter(self) -> float:
        return self.alpha

    def precision_matrix(self, X):  # pragma: no cover
        raise NotImplementedError("Precision matrix for LASSO is not implemented.")

    @property
    def support_fast_loocv(self):
        """The fast LOOCV shortcut is not valid for LASSO."""
        return False