Source code for clease.regression.sequential_cluster_ridge

import logging
import numpy as np
from .regression import LinearRegression, Tikhonov

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Public names exported by a star-import of this module.
__all__ = ("SequentialClusterRidge",)


class SequentialClusterRidge(LinearRegression):
    """
    SequentialClusterRidge is a fit method that optimizes the LOOCV over the
    regularization parameter as well as the cluster support. The method adds
    features in the design matrix X (see `fit` method) by including column
    by column. For each set of columns it performs a fit to a logspaced set
    of regularization parameters. The returned coefficients are the one from
    the model that has the smallest LOOCV.

    Parameters:

    min_alpha: float
        Minimum value of the regularization parameter alpha

    max_alpha: float
        Maximum value of the regularization parameter alpha

    num_alpha: int
        Number of alpha values

    verbose: bool
        Print information about fit after completion
    """

    def __init__(self, min_alpha=1e-10, max_alpha=10.0, num_alpha=20, verbose: bool = False):
        super().__init__()
        self.min_alpha = min_alpha
        self.max_alpha = max_alpha
        self.num_alpha = num_alpha
        self.verbose = verbose

    @staticmethod
    def _cv(X, coeff, y, l2scheme):
        """
        Calculates the cross validation score.

        Each residual ``y - X.dot(coeff)`` is divided by one minus the
        corresponding diagonal element of ``X P X^T``, where ``P`` is the
        matrix returned by ``l2scheme.precision_matrix(X)``. The returned
        score is the root-mean-square of these rescaled residuals.

        Parameters:

        X: np.ndarray
            Design matrix used for the fit

        coeff: np.ndarray
            Fitted coefficients, one per column of X

        y: np.ndarray
            Target values, one per row of X

        l2scheme:
            Regularization scheme providing ``precision_matrix(X)``
        """
        pred = X.dot(coeff)
        prec = l2scheme.precision_matrix(X)
        dy = y - pred
        dy_loo = dy / (1 - np.diag(X.dot(prec).dot(X.T)))
        return np.sqrt(np.mean(dy_loo**2))

    @staticmethod
    def _print_summary(cvs, coeffs):
        """
        Prints a summary of the search, listing at most the 20 models
        with the lowest CV score (best model first).

        Parameters:

        cvs: sequence of float
            CV score of each evaluated model

        coeffs: sequence of np.ndarray
            Coefficient vector of each evaluated model, same order as `cvs`
        """
        srt_idx = np.argsort(cvs)
        # Never index past the number of evaluated models; a small search
        # (few columns and/or few alpha values) can yield fewer than 20.
        num_shown = min(20, len(srt_idx))
        print("--------------------------------------------")
        print(" SUPPORT EXPANDING L2 SUMMARY ")
        print("--------------------------------------------")
        for idx in srt_idx[:num_shown]:
            print(
                f"Num. coeff: {len(coeffs[idx]):9d} "
                f"CV: {cvs[idx]:9.3f} meV/atom"
            )
        print("--------------------------------------------")

    def fit(self, X, y):
        """
        Performs the fitting

        Parameters:

        X: np.ndarray
            Design matrix of size (N x M). During the CV optimization
            columns of X will be added one by one starting with a model
            consisting of the two first columns (or the single column if
            M = 1) and ending with the model that uses all M columns.

        y: np.ndarray
            Vector of length N

        Returns:

        np.ndarray
            Vector of length M holding the coefficients of the model with
            the smallest CV score. Entries belonging to columns that are
            not part of the selected model are zero.

        Raises:

        ValueError
            If X has no columns or if no regularization values are
            available (``num_alpha`` < 1).
        """
        numFeat = X.shape[1]
        if numFeat < 1:
            raise ValueError("The design matrix X must have at least one column.")

        alphas = np.logspace(np.log10(self.min_alpha), np.log10(self.max_alpha), self.num_alpha)
        if len(alphas) == 0:
            raise ValueError("num_alpha must be at least 1.")

        coeffs = []
        cvs = []
        # The upper bound is inclusive so the model using every column of X
        # is also a candidate. The lower bound is clamped so that a design
        # matrix with a single column is still fitted.
        smallest_model = min(2, numFeat)
        for num_cols in range(smallest_model, numFeat + 1):
            # The column subset does not depend on alpha; slice it once.
            design = X[:, :num_cols]
            for alpha in alphas:
                scheme = Tikhonov(alpha=alpha)
                coeff = scheme.fit(design, y)
                cvs.append(self._cv(design, coeff, y, scheme))
                coeffs.append(coeff)

        best_cv = np.argmin(cvs)
        best_coeff = coeffs[best_cv]

        # Pad with zeros so the result always has one entry per column of X.
        res = np.zeros(numFeat)
        res[: len(best_coeff)] = best_coeff

        if self.verbose:
            self._print_summary(cvs, coeffs)
        return res