Source code for preprocessing

"""
This module contains several methods of preprocessing the training and target
values for both the GP Regressor and acquisition module. Instances of different
preprocessors can be chained together thereby building a *pipeline*.

The preprocessors are implemented into the GP Acquisition and GP Regressor
module in a way which performs the transformations *behind the scenes* meaning
that the user can work in the non-transformed space and all transformations will
be performed internally.

You can build your own preprocessor if you want. This requires you to build a
custom class. How to do that for X- and y-preprocessors is explained in the
:class:`Pipeline_X` and :class:`Pipeline_y` classes respectively.

NB: all ``transform``-like methods should return a copy of the input, but avoid
unnecessary ``copy`` statements.
"""

import warnings
from numbers import Number
from itertools import product

import numpy as np
from scipy.linalg import eigh, LinAlgError

from gpry.tools import delta_logp_of_1d_nstd


[docs] class DummyPreprocessor: is_linear = True
[docs] @classmethod def fit(cls, *args, **kwargs): pass
[docs] @classmethod def transform_bounds(cls, bounds): return bounds
[docs] @classmethod def transform(cls, _): return _
[docs] @classmethod def inverse_transform(cls, _): return _
[docs] @classmethod def transform_scale(cls, _): return _
[docs] @classmethod def inverse_transform_scale(cls, _): return _
[docs] class Pipeline_X: """ Used for building a pipeline for preprocessing X-values. This is provided with a list of preprocessors in the order they shall be applied. The ``transform_scale``, ``fit``, ``transform`` and ``inverse_transform`` methods can then be called as if the pipeline was a single preprocessor. Parameters ---------- preprocessors : list of preprocessors for X. The preprocessors in the order that the transformations shall be transformed. These need to either be inbuilt preprocessors or you can build your own preprocessor. For this you need to build a custom class with the signature:: from gpry.preprocessing import Preprocessor class My_X_preprocessor: def __init__(self, ...): # Add here any objects that the preprocessor might need ... def fit(self, X, y): # This method should fit the transformation # (if neccessary). ... return self def transform_bounds(self, bounds): # This method should transform the bounds of the prior. If # the bounds remain unchanged after the transformation this # method should just return the (untransformed) bounds. ... return transformed_bounds def transform(self, X): # This method transforms the X-data. For this the fit # method has to have been called before at least once. ... return X_transformed def inverse_transform(self, X): # Applies the inverse transformation to ``transform``. ... return inverse_transformed_X def transform_scale(self, scale): # This method should transform a scale in X, e.g. the bounds # of the prior. ... return transformed_scale def transform_scale(self, scale): # Applies the inverse transform to ``transform_scale``. ... return transformed_scale .. note:: All the preprocessor objects need to be initialized! Furthermore ``transform` and ``inverse_transform`` need to preserve the shape of X. All transformations must return a copy (but avoid unnecessary copy statements). """ def __init__(self, preprocessors): self.preprocessors = preprocessors self.fitted = False
[docs] def transform_bounds(self, bounds): transformed_bounds = bounds for preprocessor in self.preprocessors: transformed_bounds = preprocessor.transform_bounds(transformed_bounds) return transformed_bounds
[docs] def fit(self, X, y): """ Consecutively fit several preprocessors by passing the transformed data through each one and fitting. """ X_transformed = X for preprocessor in self.preprocessors: preprocessor.fit(X_transformed, y) X_transformed = preprocessor.transform(X_transformed) self.fitted = True return self
[docs] def transform(self, X): """ Transform the data through the pipeline """ X_transformed = X for preprocessor in self.preprocessors: X_transformed = preprocessor.transform(X_transformed) return X_transformed
[docs] def inverse_transform(self, X): """ Inverse transform the data through the pipeline (by applying each inverse transformation in reverse order). """ X_transformed = X for preprocessor in reversed(self.preprocessors): X_transformed = preprocessor.inverse_transform(X_transformed) return X_transformed
[docs] def transform_scale(self, scale): transformed_scale = scale for preprocessor in self.preprocessors: transformed_scale = preprocessor.transform_scale(transformed_scale) return transformed_scale
[docs] def inverse_transform_scale(self, scale): """ Inverse transform the data through the pipeline (by applying each inverse transformation in reverse order). """ transformed_scale = scale for preprocessor in reversed(self.preprocessors): transformed_scale = preprocessor.inverse_transform_scale(transformed_scale) return transformed_scale
# TODO: finish and fix
[docs] class Whitening: r""" A class which can pre-transform the posterior in a way that it matches a multivariate normal distribution during the Regression step. This is done in the hope that by matching a normal distribution the GP will converge faster. The transformation used is the *whitening* transformation which is given by .. math:: X_k^i \to \frac{\mathbf{R}^{ij} (X_k^j - m^j)}{\sigma^i}\ . :math:`\mathbf{R}` is the matrix which solves :math:`\mathbf{C} = \mathbf{R}\mathbf{\Lambda}\mathbf{R}` where :math:`\mathbf{C}` is the empirical covariance matrix (along the dimensions of X) and :math:`\mathbf{\Lambda}` a diagonal matrix. :math:`m^j` the empirical mean and :math:`\sigma = \sqrt{\mathbf{C}^{ii}}` the empirical standard deviation. This can help with highly anisotropic distributions, especially if degeneracy directions are known a priori. When adapting it with `learn=True`, this may not be very numerically robust, since the empirical mean and standard deviation are weighted by the posterior values which have a high dynamical range. An anisotropic kernel may be preferred. """ def __init__(self, bounds, mean=None, cov=None, learn=False): self.transf_matrix, self.inv_transf_matrix = None, None if cov is None: if not learn: raise ValueError("Needs a cov, or to be able to learn it `learn=True`.") else: try: self.transf_matrix, self.inv_transf_matrix = self.prepare_transform(cov) except ValueError as excpt: raise ValueError( f"Cannot initialize whitening transform: {excpt}" ) from excpt self.cov = cov self.learn = learn # If mean is None at the beginning, but cov is defined, assume central point. # NB: if cov undefined, mean is ignored. if mean is None: if self.cov is not None: warnings.warn( "Cov passed but not mean. Using the center of the prior hypercube." ) bounds_arr = np.array(bounds) mean = (bounds_arr[:, 0] + bounds_arr[:, 1]) / 2 self.mean = mean
[docs] @staticmethod def prepare_transform(cov): """ Compute the relevant elements for the transform from the mean and covmat. Raises `ValueError` if it fails at the eigen-decomposition. """ try: eigenvals, eigenvecs = eigh(cov) except LinAlgError as excpt: raise ValueError( f"Could not compute the eigen-decomposition of the covmat: {excpt}" ) from excpt transf_matrix = np.diag(eigenvals**-0.5) @ eigenvecs.T inv_transf_matrix = eigenvecs @ np.diag(eigenvals**0.5) return transf_matrix, inv_transf_matrix
[docs] @staticmethod def compute_mean_cov(X, logp): """ Computes mean and cov using the given points weighted by the given log-probabilities. Raises ValueError if failed to get a non-singular covmat. """ with warnings.catch_warnings(): # Raise exception for all warnings to catch them. warnings.filterwarnings("error") try: logp_exp = np.exp(logp - np.max(logp)) mean = np.average(X, axis=0, weights=logp_exp) cov = np.cov(X, aweights=logp_exp, ddof=0) except (ZeroDivisionError, TypeError, ValueError, RuntimeWarning) as excpt: raise ValueError( f"Could not compute covmat with the given points: {excpt}" ) from excpt return mean, cov
[docs] def fit(self, X, y): """ Fits the whitening transformation, if initialised with `learn=True`. If an error is encountered, keeps the previous transform. """ if not self.learn: return self warn_msg = "Could not fit a whitening transformation." warn_msg_end = "Keeping previous transfrom." try: self.mean, self.cov = self.compute_mean_cov(X, y) self.transf_matrix, self.inv_transf_matrix = self.prepare_transform(self.cov) except ValueError as excpt: warnings.warn(warn_msg + str(excpt) + warn_msg_end) return self
[docs] def transform(self, X): if self.cov is None: # adaptive (or could not initialise), but not fitted yet return np.copy(X) return (self.transf_matrix @ (X - self.mean).T).T
[docs] def inverse_transform(self, X): if self.cov is None: # adaptive (or could not initialise), but not fitted yet return np.copy(X) return (self.inv_transf_matrix @ X.T).T + self.mean
[docs] def transform_bounds(self, bounds): if self.cov is None: # adaptive (or could not initialise), but not fitted yet return bounds vertices = np.array(list(product(*bounds))) transf_vertices = self.transform(vertices) transf_bounds = np.array( [np.min(transf_vertices.T, axis=1), np.max(transf_vertices.T, axis=1)] ).T print("bounds:", bounds) print("vertices:", vertices) print("transf_vertices:", transf_vertices) print("transf_bounds:", transf_bounds) return transf_bounds
[docs] class Normalize_bounds: """ A class which transforms all bounds of the prior such that the prior hypervolume occupies the unit hypercube in the interval [0, 1]. This is done because of two reasons: #. Confining the bounds while fitting the GP regressor ensures that the hyperparameters of the GP (particularly length-scales) are within the same order of magnitude if one assumes that the non-zero region of the posterior occupies roughly the same fraction of the prior in each direction. This is a reasonable assumption for most realistic cases. #. If the with of the posterior distribution is similar along every dimension this makes it far easier for the optimizer of the acquisition function to navigate the acquisition function space (which has the same number of dimensions as the training data) especially if the optimizer uses a fixed jump-length. Parameters ---------- bounds : array-like, shape = (n_dims, 2) Bounds [lower, upper] along each dimension. Attributes ---------- transformed_bounds : array-like, shape = (n_dims, 2) Array with [0, 1] along every dimension. bounds_min : array-like, shape = (n_dims,) Lower bounds along every dimension. bounds_max : array-like, shape = (n_dims,) Upper bounds along every dimension. """ def __init__(self, bounds): self.update_bounds(bounds) self.fitted = True # only needs fitting at init
[docs] def update_bounds(self, bounds): bounds = np.asarray(bounds) self.bounds = bounds self.bounds_min = bounds[:, 0] self.bounds_max = bounds[:, 1] if np.any(self.bounds_min > self.bounds_max): raise ValueError( "The bounds must be in dimension-wise order " "min->max, got \n" + bounds )
[docs] def transform_bounds(self, bounds): transformed_bounds = np.ones_like(bounds) transformed_bounds[:, 0] = 0 return transformed_bounds
[docs] def fit(self, X, y): """Fits the transformer (which in reality does nothing)"""
[docs] def transform(self, X): """Transforms X so that all values lie between 0 and 1. Parameters ---------- X : array-like, shape = (n_samples, n_dims) X-values that one wants to transform. Must be between bounds. Returns ------- X_transformed : array-like, shape = (n_samples, n_dims) Transformed X-values """ return (X - self.bounds_min) / (self.bounds_max - self.bounds_min)
[docs] def inverse_transform(self, X): """Applies the inverse transformation Parameters ---------- X : array-like, shape = (n_samples, n_dims) Transformed X-values between 0 and 1. Returns ------- X : array-like, shape = (n_samples, n_dims) Inverse transformed (original) values. """ return (X * (self.bounds_max - self.bounds_min)) + self.bounds_min
[docs] def inverse_transform_scale(self, X): """Applies the inverse transformation to an unbounded scale (e.g. the kernel length scale). Parameters ---------- X : array-like, shape = (n_samples, n_dims) Transformed X-values between 0 and 1. Returns ------- X : array-like, shape = (n_samples, n_dims) Inverse transformed (original) values. """ return X * (self.bounds_max - self.bounds_min)
[docs] class Pipeline_y: """ Used for building a pipeline for preprocessing y-values. This is provided with a list of preprocessors in the order they shall be applied. The ``fit``, ``transform`` and ``inverse_transform`` methods can then be called as if the pipeline was a single preprocessor. Parameters ---------- preprocessors : list of preprocessors for y. The preprocessors in the order that the transformations shall be transformed. These need to either be inbuilt preprocessors or you can build your own preprocessor. For this you need to build a custom class with the signature:: class My_y_preprocessor: def __init__(self, ...): # Add here any objects that the preprocessor might need ... def fit(self, X, y): # This method should fit the transformation # (if neccessary). ... return self def transform(self, y): # This method transforms the y-data. For this the fit # method has to have been called before at least once. ... return transformed_y def inverse_transform(self, y): # Applies the inverse transformation to ``transform``. ... return inverse_transformed_y def transform_scale(self, scale): # This method should transform a scale-like quantity # (e.g. the noise level of the training data) such that # it represents the corresponding scale # of the transformed data. ... return transformed_scale def inverse_transform_scale(self, scale): # This method should invert the transformation applied by # ``transform_scale``. ... return inverse_transformed_scale .. note:: All the preprocessor objects need to be initialized! Furthermore the `transform` and `inverse_transform` methods need to preserve the shape of y. In contrast to the preprocessors for X this does not need to contain a method to transform bounds, but it still needs one to transform scales such as the the noise level (alpha). """ def __init__(self, preprocessors): self.preprocessors = preprocessors self.fitted = False
[docs] def fit(self, X, y): """ Consecutively fit several preprocessors by passing the transformed data through each one and fitting. """ y_transformed = y for preprocessor in self.preprocessors: preprocessor.fit(X, y_transformed) y_transformed = preprocessor.transform(y_transformed) self.fitted = True return self
[docs] def transform(self, y): """ Transform the data through the pipeline """ y_transformed = y for preprocessor in self.preprocessors: y_transformed = preprocessor.transform(y_transformed) return y_transformed
[docs] def inverse_transform(self, y): """ Inverse transform the data through the pipeline (by applying each inverse transformation in reverse order). """ y_transformed = y for preprocessor in reversed(self.preprocessors): y_transformed = preprocessor.inverse_transform(y_transformed) return y_transformed
[docs] def transform_scale(self, scale): """ Transforms the scale through the pipeline """ scale_transformed = scale for preprocessor in self.preprocessors: scale_transformed = preprocessor.transform_scale(scale_transformed) return scale_transformed
[docs] def inverse_transform_scale(self, scale): """ Inverse transforms the scale through the pipeline """ scale_transformed = scale for preprocessor in reversed(self.preprocessors): scale_transformed = preprocessor.inverse_transform_scale(scale_transformed) return scale_transformed
[docs] class Normalize_y: """ Transforms y-values (target values) such that they are centered around 0 with a standard deviation of 1. This is done so that the constant pre-factor in the kernel (constant kernel) stays within a numerically convenient range. Attributes ---------- mean_ : float Mean of the y-values std_ : float Standard deviation of the y-values **Methods:** .. autosummary:: :toctree: stubs transform inverse_transform """ def __init__(self, use_median=False): self.mean_ = None self.std_ = None self.use_median = bool(use_median) if self.use_median: self.get_mean_std = lambda y: (lambda y25, y50, y75: (y50, y75 - y25))( *np.percentile(y, [25, 50, 75]) ) else: self.get_mean_std = lambda y: (np.mean(y), np.std(y)) @property def is_linear(self): return True @property def fitted(self): return self.mean_ is not None and self.std_ is not None
[docs] def fit(self, X, y): """ Calculates the mean and standard deviation of y and saves them. Parameters ---------- X : array-like, shape = (n_samples, dimension) X-values (target values) that can be used to tune the transformation. determine the mean and std. y : array-like, shape = (n_samples,) y-values (target values) that are used to determine the mean and std. """ self.mean_, self.std_ = self.get_mean_std(y[np.isfinite(y)])
[docs] def transform(self, y): """Transforms y. Parameters ---------- y : array-like, shape = (n_samples,) y-values that one wants to transform. Returns ------- y_transformed : array-like, shape = (n_samples,) Transformed y-values """ if not self.fitted: raise TypeError("mean_ and std_ have not been fit before") return (y - self.mean_) / self.std_
[docs] def inverse_transform(self, y): """Applies inverse transformation to y. Parameters ---------- y_transformed : array-like, shape = (n_samples,) Transformed y-values. Returns ------- y : array-like, shape = (n_samples,) Original y-values. """ if not self.fitted: raise TypeError("mean_ and std_ have not been fit before") return (y * self.std_) + self.mean_
[docs] def transform_scale(self, scale): if not self.fitted: raise TypeError("mean_ and std_ have not been fit before") return scale / self.std_ # Divide by the standard deviation
[docs] def inverse_transform_scale(self, scale): if not self.fitted: raise TypeError("mean_ and std_ have not been fit before") return scale * self.std_ # Multiply by the standard deviation
[docs] class NormalizeChi2_y(Normalize_y): """ Transforms y-values (target values) such that they are centered around the Gaussian 1-sigma value with respect to the largest logp, so that the standard deviation is the distance between the maximum and the value that defines that 0. Attributes ---------- mean_ : float Mean of the y-values std_ : float Standard deviation of the y-values **Methods:** .. autosummary:: :toctree: stubs transform inverse_transform """ def __init__(self, nsigma=1): try: assert isinstance(nsigma, Number) and nsigma > 0 except TypeError as excpt: raise TypeError( f"nsigma must be a positive number. Got {nsigma} of type {type(nsigma)}." ) from excpt self.nsigma = nsigma self.delta_logp = None super().__init__()
[docs] def fit(self, X, y): """ Calculates the mean and standard deviation of y and saves them. Parameters ---------- X : array-like, shape = (n_samples, dimension) X-values (target values) that can be used to tune the transformation. determine the mean and std. y : array-like, shape = (n_samples,) y-values (target values) that are used to determine the mean and std. """ dim = np.atleast_2d(X).shape[1] self.delta_logp = delta_logp_of_1d_nstd(self.nsigma, dim) self.mean_ = max(y) - self.delta_logp self.std_ = self.delta_logp