diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py
index 66bafc5b89f8a4b4f30736d69ac18a0f153a31a6..ec72e4e1e1c758554ad1adfc97838a8f9479567f 100644
--- a/BLcourse2.3/01_one_dim.py
+++ b/BLcourse2.3/01_one_dim.py
@@ -32,6 +32,14 @@
 # we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with
 # $\ma X$ to keep the notation consistent with the slides.
+
+# # About
+#
+# In this notebook, we explore the material presented in [the
+# slides](https://doi.org/10.6084/m9.figshare.25988176) with code, using the
+# [gpytorch](https://gpytorch.ai) library. We cover exact GP inference and
+# hyper parameter optimization using the marginal likelihood.
+
 # # Imports, helpers, setup
 # ##%matplotlib notebook
diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py
index c0c3361717acca16379ed8155a965ac79eca284a..35311dec6507c821d60ad34d54b8a5909cbca4ba 100644
--- a/BLcourse2.3/02_two_dim.py
+++ b/BLcourse2.3/02_two_dim.py
@@ -13,7 +13,13 @@
 # name: python3
 # ---
-# In this notebook, we use a GP to fit a 2D data set.
+# # About
+#
+# In this notebook, we use a GP to fit a 2D data set. We use the same ExactGP
+# machinery as in the 1D case and show how GPs can be used for 2D interpolation
+# (when data is free of noise) or regression (noisy data). Think of this as a
+# toy geospatial data setting. Actually, in geostatistics, Gaussian process
+# regression is known as [Kriging](https://en.wikipedia.org/wiki/Kriging).
 # $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
 # $\newcommand{\ma}[1]{\mathbf{#1}}$
 # $\newcommand{\pred}[1]{\rm{#1}}$
@@ -50,6 +56,13 @@ torch.set_default_dtype(torch.float64)
 torch.manual_seed(123)
 # # Generate toy 2D data
+#
+# Our ground truth function is $f(\ve x) = \sin(r) / r$ with $\ve x =
+# [x_0,x_1] \in\mathbb R^2$ and the radial distance $r=\sqrt{\ve x^\top\,\ve x}$.
+# This function is also known as the "Mexican hat": a radial wave-like pattern
+# that decays with distance from the center $\ve x=\ve 0$. We generate data by
+# randomly sampling 2D points $\ve x_i$ and calculating $y_i = f(\ve x_i)$,
+# optionally adding Gaussian noise further down.
 class MexicanHat:
@@ -304,11 +317,15 @@ with torch.no_grad():
     assert (post_pred_f.mean == post_pred_y.mean).all()
 # -
-# When `use_noise=False`, then the GP's prediction is an almost pefect
+# When `use_noise=False`, the GP's prediction is an almost perfect
 # reconstruction of the ground truth function (in-distribution, so where we
-# have data).
+# have data). While 3D plots are fun, they are not optimal for judging how well
+# the GP model represents the ground truth function.
 # # Plot difference to ground truth and uncertainty
+#
+# Let's use contour plots to visualize the difference between GP prediction and
+# ground truth, as well as epistemic, total and aleatoric uncertainty.
 # +
 ncols = 4
@@ -432,7 +449,7 @@ print(
 #   0.2).
 # * out-of-distribution: `f_std` (epistemic) dominates
-# Exercises
+# # Exercises
 #
 # Go back up, switch on the settings for Exercise 2 and re-run the notebook.
 # Same with Exercise 3.
diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py
index f708a7197a808741e6955e4761f2baa76315e7cd..86d70736f8de63c85fe79b40502c9cd450281f15 100644
--- a/BLcourse2.3/03_one_dim_SVI.py
+++ b/BLcourse2.3/03_one_dim_SVI.py
@@ -13,9 +13,12 @@
 # name: bayes-ml-course
 # ---
-# In this notebook, we replace the ExactGP inference by using SVI (stochastic
-# variational inference).
+# # About
 #
+# In this notebook, we replace the ExactGP inference and log marginal
+# likelihood optimization with sparse stochastic variational inference (SVI).
+# This serves as an example of the many methods `gpytorch` offers to make GPs
+# scale to large data sets.
 # $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
 # $\newcommand{\ma}[1]{\mathbf{#1}}$
 # $\newcommand{\pred}[1]{\rm{#1}}$
@@ -43,16 +46,20 @@ from matplotlib import is_interactive
 import numpy as np
 from torch.utils.data import TensorDataset, DataLoader
-
 from utils import extract_model_params, plot_samples
 torch.set_default_dtype(torch.float64)
-
 torch.manual_seed(123)
 # -
 # # Generate toy 1D data
+#
+# Now we generate 10x more points than in the ExactGP case. Still, inference
+# won't be much slower, since the sparse method below avoids the roughly
+# $N^3$ scaling of exact GPs. Note that the data is still tiny (1000 points
+# is easy even for exact GPs), so the method's usefulness cannot be fully
+# exploited in our example -- also we don't even use a GPU yet :).
+
 # +
@@ -94,25 +101,42 @@ print(f"{X_pred.shape=}")
 # -
 # # Define GP model
+#
+# The model follows [this
+# example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html)
+# based on [Hensman et al., "Scalable Variational Gaussian Process Classification",
+# 2015](https://proceedings.mlr.press/v38/hensman15.html). The model is
+# "sparse" since it works with a set of *inducing* points $(\ma Z, \ve u),
+# \ve u=f(\ma Z)$, which is much smaller than the training data $(\ma X, \ve y)$.
+#
+# We have the same hyper parameters as before
+#
+# * $\ell$ = `model.covar_module.base_kernel.lengthscale`
+# * $\sigma_n^2$ = `likelihood.noise_covar.noise`
+# * $s$ = `model.covar_module.outputscale`
+# * $m(\ve x) = c$ = `model.mean_module.constant`
+#
+# plus additional ones, introduced by the approximations used:
+#
+# * the learnable inducing points $\ma Z$ for the variational distribution
+#   $q_{\ve\psi}(\ve u)$
+# * learnable parameters of the variational distribution $q_{\ve\psi}(\ve u)=\mathcal N(\ve m_u, \ma S)$: the
+#   variational mean $\ve m_u$ and the covariance $\ma S$ in the form of a lower
+#   triangular matrix $\ma L$ such that $\ma S=\ma L\,\ma L^\top$
+
 # +
 class ApproxGPModel(gpytorch.models.ApproximateGP):
-    """API:
-
-    model.forward()              prior                 f_pred
-    model()                      posterior             f_pred
-
-    likelihood(model.forward())  prior with noise      y_pred
-    likelihood(model())          posterior with noise  y_pred
-    """
-
-    def __init__(self, X_ind):
+    def __init__(self, Z):
+        # Approximate inducing value posterior q(u), u = f(Z), Z = inducing
+        # points (subset of X_train)
         variational_distribution = (
-            gpytorch.variational.CholeskyVariationalDistribution(X_ind.size(0))
+            gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))
         )
+        # Compute q(f(X)) from q(u)
         variational_strategy = gpytorch.variational.VariationalStrategy(
             self,
-            X_ind,
+            Z,
             variational_distribution,
             learn_inducing_locations=False,
         )
@@ -123,7 +147,6 @@ class ApproxGPModel(gpytorch.models.ApproximateGP):
         )
     def forward(self, x):
-        """The prior, defined in terms of the mean and covariance function."""
         mean_x = self.mean_module(x)
         covar_x = self.covar_module(x)
         return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
@@ -132,9 +155,10 @@ likelihood = gpytorch.likelihoods.GaussianLikelihood()
 n_train = len(X_train)
+# Start values for inducing points Z, use a 10% random sub-sample of X_train.
 ind_points_fraction = 0.1
-ind_idxs = torch.randperm(n_train)[:int(n_train * ind_points_fraction)]
-model = ApproxGPModel(X_ind=X_train[ind_idxs])
+ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)]
+model = ApproxGPModel(Z=X_train[ind_idxs])
 # -
@@ -157,14 +181,34 @@ model.mean_module.constant = 3.0
 model.covar_module.base_kernel.lengthscale = 1.0
 model.covar_module.outputscale = 1.0
 likelihood.noise_covar.noise = 0.3
-
-##print("model params:")
-##pprint(extract_model_params(model, raw=False, try_item=False))
-##print("likelihood params:")
-##pprint(extract_model_params(likelihood, raw=False, try_item=False))
 # -
 # # Fit GP to data: optimize hyper params
+#
+# Now we optimize the GP hyper parameters using a GP-specific variational
+# inference (VI), where we maximize not the log marginal likelihood (as in
+# the ExactGP case), but an ELBO (evidence lower bound) objective
+#
+# $$
+# \max_\ve\zeta\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] -
+# D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right)
+# $$
+#
+# with respect to
+#
+# $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi]$$
+#
+# where
+#
+# $$\ve\psi = [\ve m_u, \ma Z, \ma L]$$
+#
+# are the parameters of the variational distribution $q_{\ve\psi}(\ve u)$.
+#
+# In addition, we perform a stochastic
+# optimization by using a deep-learning-style mini-batch loop, hence
+# "stochastic" variational inference (SVI). This speeds up the
+# optimization since we only look at a fraction of the data per optimizer
+# step to calculate an approximate loss gradient (`loss.backward()`).
 # +
 # Train mode
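+# A minimal sketch of the mini-batch SVI loop described above, assuming the
+# standard `gpytorch.mlls.VariationalELBO` objective and the `model`,
+# `likelihood` and data tensors defined earlier; batch size, learning rate
+# and number of epochs are arbitrary example values.
+model.train()
+likelihood.train()
+optimizer = torch.optim.Adam(
+    [{"params": model.parameters()}, {"params": likelihood.parameters()}],
+    lr=0.1,
+)
+# The ELBO objective; num_data is needed to scale the mini-batch likelihood
+# term to the full data set size.
+loss_func = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=n_train)
+loader = DataLoader(
+    TensorDataset(X_train, y_train), batch_size=128, shuffle=True
+)
+for _ in range(100):  # epochs
+    for X_batch, y_batch in loader:
+        optimizer.zero_grad()
+        # Negative ELBO estimate from this mini-batch
+        loss = -loss_func(model(X_batch), y_batch)
+        loss.backward()
+        optimizer.step()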