Source code for clease.regression.regression

"""Collection of classess to perform regression."""
from typing import Union, Optional, List
import numpy as np
from numpy.linalg import pinv
from sklearn.linear_model import Lasso as skLasso
from scipy.linalg import lstsq
from clease.data_normalizer import DataNormalizer

__all__ = ("LinearRegression", "Tikhonov", "Lasso")


[docs]class LinearRegression: def __init__(self) -> None: self._weight_matrix = None self.tol = 1e-8 @property def weight_matrix(self): return self._weight_matrix def _ensure_weight_matrix_consistency(self, data: np.ndarray) -> None: """Raise an error if the dimensions of the weight matrix is not consistent. :param data: y-values in the fit """ if self._weight_matrix is not None: if self._weight_matrix.shape[1] != len(data): raise ValueError( f"The provided weight matrix needs to have " f"dimension {len(data)}x{len(data)}, " f"{self._weight_matrix.shape} given" )
[docs] def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: """Fit a linear model by performing ordinary least squares y = Xc :param X: Design matrix (NxM) :param y: Data points (vector of length N) """ self._ensure_weight_matrix_consistency(y) return lstsq(X, y)[0] # Only fetch the solution
def precision_matrix(self, X: np.ndarray) -> np.ndarray: D, V_h = np.linalg.svd(X, full_matrices=False)[1:] # U is unused V = V_h.T diag = np.zeros_like(D) mask = np.abs(D) > self.tol diag[mask] = 1.0 / D[mask] ** 2 return V.dot(np.diag(diag)).dot(V.T) def get_instance_array(self) -> List[object]: # pylint: disable=no-self-use return [LinearRegression()] def is_scalar(self): # pylint: disable=no-self-use return False def get_scalar_parameter(self): # pragma: no cover # pylint: disable=no-self-use raise ValueError("Fitting scheme is not described by a scalar parameter!") @property def support_fast_loocv(self): return True
[docs]class Tikhonov(LinearRegression): """Ridge regularization. :param alpha: regularization term - float: A single regularization coefficient is used for all features. Tikhonov matrix is T = alpha * I (I = identity matrix). - 1D array: Regularization coefficient is defined for each feature. Tikhonov matrix is T = diag(alpha) (the alpha values are put on the diagonal). The length of array should match the number of features. - 2D array: Full Tikhonov matrix supplied by a user. The dimensions of the matrix should be M * M where M is the number of features. :param normalize: If True each feature will be normalized to before fitting """ def __init__( self, alpha: Union[float, np.ndarray] = 1e-5, penalize_bias_term: bool = False, normalize: bool = True, ) -> None: super().__init__() self.alpha = alpha self.penalize_bias_term = penalize_bias_term self.normalize = normalize def _get_tikhonov_matrix(self, num_clusters: int) -> np.ndarray: if isinstance(self.alpha, np.ndarray): if len(self.alpha.shape) == 1: tikhonov = np.diag(self.alpha) elif len(self.alpha.shape) == 2: tikhonov = self.alpha else: raise ValueError("Matrix have to have dimension 1 or 2") else: # Alpha is a floating point number tikhonov = np.identity(num_clusters) if not self.penalize_bias_term: tikhonov[0, 0] = 0.0 tikhonov *= np.sqrt(self.alpha) return tikhonov
[docs] def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: """Fit coefficients based on Ridge regularizeation.""" self._ensure_weight_matrix_consistency(y) if self.weight_matrix is None: W = np.ones(len(y)) else: W = np.diag(self.weight_matrix) X_fit = X y_fit = y if self.normalize: if np.any(np.abs(X[:, 0] - 1.0) > 1e-16): msg = "Tikhonov: When normalize=True. The first column in X " msg += "should correspond to a to a bias term. Therefore, all " msg += f"entries should be 1.\nGot:\n{X[:, 0]}\n" raise ValueError(msg) normalizer = DataNormalizer() X_fit, y_fit = normalizer.normalize(X[:, 1:], y) precision = self.precision_matrix(X_fit) coeff = precision.dot(X_fit.T.dot(W * y_fit)) if self.normalize: coeff_with_bias = np.zeros(len(coeff) + 1) coeff_with_bias[1:] = normalizer.convert(coeff) coeff_with_bias[0] = normalizer.bias(coeff) coeff = coeff_with_bias return coeff
[docs] def precision_matrix(self, X: np.ndarray) -> np.ndarray: """Calculate the presicion matrix.""" num_features = X.shape[1] tikhonov = self._get_tikhonov_matrix(num_features) if tikhonov.shape != (num_features, num_features): raise ValueError( "The dimensions of Tikhonov matrix do not match the number of clusters!" ) W = self.weight_matrix if W is None: W = np.eye(X.shape[0]) precision = pinv(X.T.dot(W.dot(X)) + tikhonov.T.dot(tikhonov)) return precision
@staticmethod def get_instance_array( alpha_min: float, alpha_max: float, num_alpha: int = 10, scale: Optional[str] = "log", ) -> List[object]: # pylint: disable=arguments-differ if scale == "log": alpha = np.logspace( np.log10(alpha_min), np.log10(alpha_max), int(num_alpha), endpoint=True ) else: alpha = np.linspace(alpha_min, alpha_max, int(num_alpha), endpoint=True) return [Tikhonov(alpha=a) for a in alpha] def is_scalar(self) -> bool: return isinstance(self.alpha, float) def get_scalar_parameter(self) -> float: if self.is_scalar(): return self.alpha return super().get_scalar_parameter()
[docs]class Lasso(LinearRegression): """LASSO regularization. :param alpha: regularization coefficient :param max_iter: (int) Maximum number of iterations. """ def __init__(self, alpha: float = 1e-5, max_iter: int = int(1e6)) -> None: super().__init__() self.alpha = alpha self.max_iter = max_iter @property def max_iter(self) -> int: return self._max_iter @max_iter.setter def max_iter(self, value) -> None: # sklearn Lasso class must have integer max_iter values. self._max_iter = int(value)
[docs] def fit(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: """Fit coefficients based on LASSO regularizeation.""" lasso = skLasso(alpha=self.alpha, fit_intercept=False, copy_X=True, max_iter=self.max_iter) lasso.fit(X, y) return lasso.coef_
@property def weight_matrix(self): return LinearRegression.weight_matrix(self) @staticmethod def get_instance_array( alpha_min: float, alpha_max: float, num_alpha: int = 10, scale: str = "log" ) -> List[object]: # pylint: disable=arguments-differ if scale == "log": alpha = np.logspace( np.log10(alpha_min), np.log10(alpha_max), int(num_alpha), endpoint=True ) else: alpha = np.linspace(alpha_min, alpha_max, int(num_alpha), endpoint=True) return [Lasso(alpha=a) for a in alpha] def is_scalar(self): return True def get_scalar_parameter(self) -> float: return self.alpha def precision_matrix(self, X): # pragma: no cover raise NotImplementedError("Precision matrix for LASSO is not implemented.") @property def support_fast_loocv(self): return False