diff --git a/BLcourse2.3/01_simple_gp_regression.ipynb b/BLcourse2.3/01_simple_gp_regression.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b6581706745e41359849d48b271ce9febbd37d58 --- /dev/null +++ b/BLcourse2.3/01_simple_gp_regression.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8bc26c14", + "metadata": {}, + "source": [ + "# Notation\n", + "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n", + "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\pred}[1]{\\widehat{#1}}$\n", + "$\\newcommand{\\cov}{\\mathrm{cov}}$\n", + "\n", + "Vector $\\ve a\\in\\mathbb R^n$ or $\\mathbb R^{n\\times 1}$, so \"column\" vector.\n", + "Matrix $\\ma A\\in\\mathbb R^{n\\times m}$. Design matrix with input vectors $\\ve\n", + "x_i\\in\\mathbb R^D$: $\\ma X = [\\ldots, \\ve x_i, \\ldots]^\\top \\in\\mathbb\n", + "R^{N\\times D}$.\n", + "\n", + "We use 1D data, so in fact $\\ma X \\in\\mathbb R^{N\\times 1}$ is a vector, but\n", + "we still denote the collection of all $\\ve x_i = x_i\\in\\mathbb R$ points with\n", + "$\\ma X$ to keep the notation consistent with the slides." + ] + }, + { + "cell_type": "markdown", + "id": "3ef36e32", + "metadata": {}, + "source": [ + "# Imports, helpers, setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c86d69b2", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import math\n", + "from collections import defaultdict\n", + "from pprint import pprint\n", + "\n", + "import torch\n", + "import gpytorch\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib import is_interactive\n", + "\n", + "\n", + "def extract_model_params(model, raw=False) -> dict:\n", + " \"\"\"Helper to convert model.named_parameters() to dict.\n", + "\n", + " With raw=True, use\n", + " foo.bar.raw_param\n", + " else\n", + " foo.bar.param\n", + "\n", + " See https://docs.gpytorch.ai/en/stable/examples/00_Basic_Usage/Hyperparameters.html#Raw-vs-Actual-Parameters\n", + " \"\"\"\n", + " if raw:\n", + " return dict(\n", + " (p_name, p_val.item())\n", + " for p_name, p_val in model.named_parameters()\n", + " )\n", + " else:\n", + " out = dict()\n", + " # p_name = 'covar_module.base_kernel.raw_lengthscale'. Access\n", + " # model.covar_module.base_kernel.lengthscale (w/o the raw_)\n", + " for p_name, p_val in model.named_parameters():\n", + " # Yes, eval() hack. Sorry.\n", + " p_name = p_name.replace(\".raw_\", \".\")\n", + " p_val = eval(f\"model.{p_name}\")\n", + " out[p_name] = p_val.item()\n", + " return out\n", + "\n", + "\n", + "def plot_samples(ax, X_pred, samples, label=None, **kwds):\n", + " plot_kwds = dict(color=\"tab:green\", alpha=0.3)\n", + " plot_kwds.update(kwds)\n", + "\n", + " if label is None:\n", + " ax.plot(X_pred, samples.T, **plot_kwds)\n", + " else:\n", + " ax.plot(X_pred, samples[0, :], **plot_kwds, label=label)\n", + " ax.plot(X_pred, samples[1:, :].T, **plot_kwds, label=\"_\")\n", + "\n", + "\n", + "# Default float32 results in slightly noisy prior samples. Less so with\n", + "# float64. We get a warning with both\n", + "# .../lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40:\n", + "# NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal\n", + "# but the noise is smaller w/ float64. Reason must be that the `sample()`\n", + "# method [1] calls `rsample()` [2] which performs a Cholesky decomposition of\n", + "# the covariance matrix. 
The default in\n", + "# np.random.default_rng().multivariate_normal() is method=\"svd\", which is\n", + "# slower but seemingly a bit more stable.\n", + "#\n", + "# [1] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.sample\n", + "# [2] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.rsample\n", + "torch.set_default_dtype(torch.float64)\n", + "\n", + "torch.manual_seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "9c543940", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Generate toy 1D data\n", + "\n", + "Here we generate noisy 1D data `X_train`, `y_train` as well as an extended\n", + "x-axis `X_pred` which we use later for prediction also outside of the data\n", + "range (extrapolation). The data has a constant offset `const` which we use to\n", + "test learning a GP mean function $m(\\ve x)$. We create a gap in the data to\n", + "show how the model uncertainty will behave there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0674cc5a", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(x, gaps=[[1, 3]], const=5):\n", + " y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const\n", + " msk = torch.tensor([True] * len(x))\n", + " if gaps is not None:\n", + " for g in gaps:\n", + " msk = msk & ~((x > g[0]) & (x < g[1]))\n", + " return x[msk], y[msk]\n", + "\n", + "\n", + "x = torch.linspace(0, 4 * math.pi, 100)\n", + "X_train, y_train = generate_data(x, gaps=[[6, 10]])\n", + "X_pred = torch.linspace(\n", + " X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n", + ")\n", + "\n", + "print(f\"{X_train.shape=}\")\n", + "print(f\"{y_train.shape=}\")\n", + "print(f\"{X_pred.shape=}\")\n", + "\n", + "plt.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\")" + ] + }, + { + "cell_type": "markdown", + "id": "e4ddc32f", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Define GP model\n", + "\n", + "We define the simplest possible textbook GP model using a Gaussian\n", + "likelihood. The kernel is the squared exponential kernel with a scaling\n", + "factor.\n", + "\n", + "$$\\kappa(\\ve x_i, \\ve x_j) = \\sigma_f\\,\\exp\\left(-\\frac{\\lVert\\ve x_i - \\ve x_j\\rVert_2^2}{2\\,\\ell^2}\\right)$$\n", + "\n", + "This makes two hyper params, namely the length scale $\\ell$ and the scaling\n", + "$\\sigma_f$. The latter is implemented by wrapping the `RBFKernel` with\n", + "`ScaleKernel`.\n", + "\n", + "In addition, we define a constant mean via `ConstantMean`. Finally we have\n", + "the likelihood noise $\\sigma_n^2$. 
So in total we have 4 hyper params.\n", + "\n", + "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n", + "* $\\sigma_n^2$ = `model.likelihood.noise_covar.noise`\n", + "* $\\sigma_f$ = `model.covar_module.outputscale`\n", + "* $m(\\ve x) = c$ = `model.mean_module.constant`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93822ff7", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class ExactGPModel(gpytorch.models.ExactGP):\n", + " \"\"\"API:\n", + "\n", + " model.forward() prior f_pred\n", + " model() posterior f_pred\n", + "\n", + " likelihood(model.forward()) prior with noise y_pred\n", + " likelihood(model()) posterior with noise y_pred\n", + " \"\"\"\n", + "\n", + " def __init__(self, X_train, y_train, likelihood):\n", + " super().__init__(X_train, y_train, likelihood)\n", + " self.mean_module = gpytorch.means.ConstantMean()\n", + " self.covar_module = gpytorch.kernels.ScaleKernel(\n", + " gpytorch.kernels.RBFKernel()\n", + " )\n", + "\n", + " def forward(self, x):\n", + " mean_x = self.mean_module(x)\n", + " covar_x = self.covar_module(x)\n", + " return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)\n", + "\n", + "\n", + "likelihood = gpytorch.likelihoods.GaussianLikelihood()\n", + "model = ExactGPModel(X_train, y_train, likelihood)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3288a7ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the model\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9348a1db", + "metadata": {}, + "outputs": [], + "source": [ + "# Default start hyper params\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2b5d326", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Set new start hyper params\n", + "model.mean_module.constant = 3.0\n", + "model.covar_module.base_kernel.lengthscale = 1.0\n", + "model.covar_module.outputscale = 1.0\n", + "model.likelihood.noise_covar.noise = 0.1\n", + "\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "markdown", + "id": "c7309ee3", + "metadata": {}, + "source": [ + "# Sample from the GP prior\n", + "\n", + "We sample a number of functions $f_j, j=1,\\ldots,M$ from the GP prior and\n", + "evaluate them at all $\\ma X$ = `X_pred` points, of which we have $N'=200$. So\n", + "we effectively generate samples from $p(\\pred{\\ve y}|\\ma X) = \\mathcal N(\\ve\n", + "c, \\ma K)$. Each sampled vector $\\pred{\\ve y}\\in\\mathbb R^{N'}$ and the\n", + "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N'\\times N'}$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56b26cd6", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " # Prior\n", + " M = 10\n", + " pri_f = model.forward(X_pred)\n", + " f_mean = pri_f.mean\n", + " f_std = pri_f.stddev\n", + " f_samples = pri_f.sample(sample_shape=torch.Size((M,)))\n", + " print(f\"{pri_f=}\")\n", + " print(f\"{pri_f.mean.shape=}\")\n", + " print(f\"{pri_f.covariance_matrix.shape=}\")\n", + " print(f\"{f_samples.shape=}\")\n", + " fig, ax = plt.subplots()\n", + " ax.plot(X_pred, f_mean, color=\"tab:red\", label=\"mean\", lw=2)\n", + " plot_samples(ax, X_pred, f_samples, label=\"prior samples\")\n", + " ax.fill_between(\n", + " X_pred,\n", + " f_mean - 2 * f_std,\n", + " f_mean + 2 * f_std,\n", + " color=\"tab:orange\",\n", + " alpha=0.2,\n", + " label=\"confidence\",\n", + " )\n", + " ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "6758e47f", + "metadata": {}, + "source": [ + "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n", + "\\ve c$ does *not* mean that each sampled vector $\\pred{\\ve y}$'s mean is\n", + "equal to $c$. Instead, we have that at each $\\ve x_i$, the mean of\n", + "*all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M f_j(\\ve x_i)\n", + "\\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69ae529a", + "metadata": {}, + "outputs": [], + "source": [ + "# Look at the first 20 x points from M=10 samples\n", + "print(f\"{f_samples.shape=}\")\n", + "print(f\"{f_samples.mean(axis=0)[:20]=}\")\n", + "print(f\"{f_samples.mean(axis=0).mean()=}\")\n", + "print(f\"{f_samples.mean(axis=0).std()=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df46aa3f", + "metadata": {}, + "outputs": [], + "source": [ + "# Take more samples, the means should get closer to c\n", + "f_samples = pri_f.sample(sample_shape=torch.Size((M * 200,)))\n", + "print(f\"{f_samples.shape=}\")\n", + "print(f\"{f_samples.mean(axis=0)[:20]=}\")\n", + "print(f\"{f_samples.mean(axis=0).mean()=}\")\n", + "print(f\"{f_samples.mean(axis=0).std()=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5a9350d8", + "metadata": {}, + "source": [ + "# Fit GP to data: optimize hyper params\n", + "\n", + "In each step of the optimizer, we condition on the training data (e.g. do\n", + "Bayesian inference) to calculate the weight posterior for the current values\n", + "of the hyper params.\n", + "\n", + "We use a simplistic PyTorch-style hand written train loop without convergence\n", + "control, so make sure to use enough `n_iter` and eyeball-check that the loss\n", + "is converged :-)\n", + "\n", + "Observe how all hyper params converge. In particular, note that the constant\n", + "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "638d1244", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Train mode\n", + "model.train()\n", + "likelihood.train()\n", + "\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.1)\n", + "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n", + "\n", + "n_iter = 200\n", + "history = defaultdict(list)\n", + "for ii in range(n_iter):\n", + " optimizer.zero_grad()\n", + " loss = -loss_func(model(X_train), y_train)\n", + " loss.backward()\n", + " optimizer.step()\n", + " if (ii + 1) % 10 == 0:\n", + " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", + " for p_name, p_val in extract_model_params(model).items():\n", + " history[p_name].append(p_val)\n", + " history[\"loss\"].append(loss.item())\n", + "\n", + "# Plot hyper params and loss (neg. log marginal likelihood) convergence\n", + "ncols = len(history)\n", + "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", + "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" + ] + }, + { + "cell_type": "markdown", + "id": "777b67d9", + "metadata": {}, + "source": [ + "# Run prediction\n", + "\n", + "We show \"noiseless\" (left: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma)}$) vs.\n", + "\"noisy\" (right: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma + \\sigma_n^2\\,\\ma\n", + "I_N)}$) predictions, where $\\ma\\Sigma\\equiv\\cov(\\ve f_*)$ is the posterior\n", + "predictive covariance matrix from R&W 2006 eq. 2.24 with $\\ma K = K(X,X)$,\n", + "$\\ma K'=K(X_*, X)$ and $\\ma K''=K(X_*, X_*)$, so\n", + "\n", + "$$\\ma\\Sigma = \\ma K'' - \\ma K'\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\ma K'^\\top$$\n", + "\n", + "See\n", + "https://elcorto.github.io/gp_playground/content/gp_pred_comp/notebook_plot.html\n", + "for details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "740fb21a", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluation (predictive posterior) mode\n", + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " post_pred_f = model(X_pred)\n", + " post_pred_y = likelihood(model(X_pred))\n", + "\n", + " fig, axs = plt.subplots(ncols=2, figsize=(12, 5))\n", + " for ii, (ax, post_pred) in enumerate(zip(axs, [post_pred_f, post_pred_y])):\n", + " yf_mean = post_pred.mean\n", + " yf_samples = post_pred.sample(sample_shape=torch.Size((10,)))\n", + "\n", + " ##lower, upper = post_pred.confidence_region()\n", + " yf_std = post_pred.stddev\n", + " lower = yf_mean - 2 * yf_std\n", + " upper = yf_mean + 2 * yf_std\n", + " ax.plot(\n", + " X_train.numpy(),\n", + " y_train.numpy(),\n", + " \"o\",\n", + " label=\"data\",\n", + " color=\"tab:blue\",\n", + " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " yf_mean.numpy(),\n", + " label=\"mean\",\n", + " color=\"tab:red\",\n", + " lw=2,\n", + " )\n", + " ax.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=\"confidence\",\n", + " color=\"tab:orange\",\n", + " alpha=0.3,\n", + " )\n", + " y_min = y_train.min()\n", + " y_max = y_train.max()\n", + " y_span = y_max - y_min\n", + " ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])\n", + " plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. 
samples\")\n", + " if ii == 1:\n", + " ax.legend()\n", + "\n", + "# When running as script\n", + "if not is_interactive():\n", + " plt.show()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BLcourse2.3/01_simple_gp_regression.py b/BLcourse2.3/01_simple_gp_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfcda544ec40c9be131cde8bb0937b0980f886c --- /dev/null +++ b/BLcourse2.3/01_simple_gp_regression.py @@ -0,0 +1,372 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.14.4 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Notation +# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$ +# $\newcommand{\ma}[1]{\mathbf{#1}}$ +# $\newcommand{\pred}[1]{\widehat{#1}}$ +# $\newcommand{\cov}{\mathrm{cov}}$ +# +# Vector $\ve a\in\mathbb R^n$ or $\mathbb R^{n\times 1}$, so "column" vector. +# Matrix $\ma A\in\mathbb R^{n\times m}$. Design matrix with input vectors $\ve +# x_i\in\mathbb R^D$: $\ma X = [\ldots, \ve x_i, \ldots]^\top \in\mathbb +# R^{N\times D}$. +# +# We use 1D data, so in fact $\ma X \in\mathbb R^{N\times 1}$ is a vector, but +# we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with +# $\ma X$ to keep the notation consistent with the slides. + +# %% [markdown] +# # Imports, helpers, setup + +# %% +# %matplotlib inline + +import math +from collections import defaultdict +from pprint import pprint + +import torch +import gpytorch +from matplotlib import pyplot as plt +from matplotlib import is_interactive + + +def extract_model_params(model, raw=False) -> dict: + """Helper to convert model.named_parameters() to dict. + + With raw=True, use + foo.bar.raw_param + else + foo.bar.param + + See https://docs.gpytorch.ai/en/stable/examples/00_Basic_Usage/Hyperparameters.html#Raw-vs-Actual-Parameters + """ + if raw: + return dict( + (p_name, p_val.item()) + for p_name, p_val in model.named_parameters() + ) + else: + out = dict() + # p_name = 'covar_module.base_kernel.raw_lengthscale'. Access + # model.covar_module.base_kernel.lengthscale (w/o the raw_) + for p_name, p_val in model.named_parameters(): + # Yes, eval() hack. Sorry. + p_name = p_name.replace(".raw_", ".") + p_val = eval(f"model.{p_name}") + out[p_name] = p_val.item() + return out + + +def plot_samples(ax, X_pred, samples, label=None, **kwds): + plot_kwds = dict(color="tab:green", alpha=0.3) + plot_kwds.update(kwds) + + if label is None: + ax.plot(X_pred, samples.T, **plot_kwds) + else: + ax.plot(X_pred, samples[0, :], **plot_kwds, label=label) + ax.plot(X_pred, samples[1:, :].T, **plot_kwds, label="_") + + +# Default float32 results in slightly noisy prior samples. Less so with +# float64. We get a warning with both +# .../lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: +# NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal +# but the noise is smaller w/ float64. Reason must be that the `sample()` +# method [1] calls `rsample()` [2] which performs a Cholesky decomposition of +# the covariance matrix. The default in +# np.random.default_rng().multivariate_normal() is method="svd", which is +# slower but seemingly a bit more stable. 
+# +# [1] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.sample +# [2] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.rsample +torch.set_default_dtype(torch.float64) + +torch.manual_seed(123) + +# %% [markdown] +# # Generate toy 1D data +# +# Here we generate noisy 1D data `X_train`, `y_train` as well as an extended +# x-axis `X_pred` which we use later for prediction also outside of the data +# range (extrapolation). The data has a constant offset `const` which we use to +# test learning a GP mean function $m(\ve x)$. We create a gap in the data to +# show how the model uncertainty will behave there. + + +# %% +def generate_data(x, gaps=[[1, 3]], const=5): + y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const + msk = torch.tensor([True] * len(x)) + if gaps is not None: + for g in gaps: + msk = msk & ~((x > g[0]) & (x < g[1])) + return x[msk], y[msk] + + +x = torch.linspace(0, 4 * math.pi, 100) +X_train, y_train = generate_data(x, gaps=[[6, 10]]) +X_pred = torch.linspace( + X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False +) + +print(f"{X_train.shape=}") +print(f"{y_train.shape=}") +print(f"{X_pred.shape=}") + +plt.scatter(X_train, y_train, marker="o", color="tab:blue") + +# %% [markdown] +# # Define GP model +# +# We define the simplest possible textbook GP model using a Gaussian +# likelihood. The kernel is the squared exponential kernel with a scaling +# factor. +# +# $$\kappa(\ve x_i, \ve x_j) = \sigma_f\,\exp\left(-\frac{\lVert\ve x_i - \ve x_j\rVert_2^2}{2\,\ell^2}\right)$$ +# +# This makes two hyper params, namely the length scale $\ell$ and the scaling +# $\sigma_f$. The latter is implemented by wrapping the `RBFKernel` with +# `ScaleKernel`. +# +# In addition, we define a constant mean via `ConstantMean`. Finally we have +# the likelihood noise $\sigma_n^2$. So in total we have 4 hyper params. +# +# * $\ell$ = `model.covar_module.base_kernel.lengthscale` +# * $\sigma_n^2$ = `model.likelihood.noise_covar.noise` +# * $\sigma_f$ = `model.covar_module.outputscale` +# * $m(\ve x) = c$ = `model.mean_module.constant` + + +# %% +class ExactGPModel(gpytorch.models.ExactGP): + """API: + + model.forward() prior f_pred + model() posterior f_pred + + likelihood(model.forward()) prior with noise y_pred + likelihood(model()) posterior with noise y_pred + """ + + def __init__(self, X_train, y_train, likelihood): + super().__init__(X_train, y_train, likelihood) + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel() + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +likelihood = gpytorch.likelihoods.GaussianLikelihood() +model = ExactGPModel(X_train, y_train, likelihood) + + +# %% +# Inspect the model +print(model) + +# %% +# Default start hyper params +pprint(extract_model_params(model, raw=False)) + +# %% +# Set new start hyper params +model.mean_module.constant = 3.0 +model.covar_module.base_kernel.lengthscale = 1.0 +model.covar_module.outputscale = 1.0 +model.likelihood.noise_covar.noise = 0.1 + +pprint(extract_model_params(model, raw=False)) + + +# %% [markdown] +# # Sample from the GP prior +# +# We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and +# evaluate them at all $\ma X$ = `X_pred` points, of which we have $N'=200$. 
So +# we effectively generate samples from $p(\pred{\ve y}|\ma X) = \mathcal N(\ve +# c, \ma K)$. Each sampled vector $\pred{\ve y}\in\mathbb R^{N'}$ and the +# covariance (kernel) matrix is $\ma K\in\mathbb R^{N'\times N'}$. + +# %% +model.eval() +likelihood.eval() + +with torch.no_grad(): + # Prior + M = 10 + pri_f = model.forward(X_pred) + f_mean = pri_f.mean + f_std = pri_f.stddev + f_samples = pri_f.sample(sample_shape=torch.Size((M,))) + print(f"{pri_f=}") + print(f"{pri_f.mean.shape=}") + print(f"{pri_f.covariance_matrix.shape=}") + print(f"{f_samples.shape=}") + fig, ax = plt.subplots() + ax.plot(X_pred, f_mean, color="tab:red", label="mean", lw=2) + plot_samples(ax, X_pred, f_samples, label="prior samples") + ax.fill_between( + X_pred, + f_mean - 2 * f_std, + f_mean + 2 * f_std, + color="tab:orange", + alpha=0.2, + label="confidence", + ) + ax.legend() + + +# %% [markdown] +# Let's investigate the samples more closely. A constant mean $\ve m(\ma X) = +# \ve c$ does *not* mean that each sampled vector $\pred{\ve y}$'s mean is +# equal to $c$. Instead, we have that at each $\ve x_i$, the mean of +# *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M f_j(\ve x_i) +# \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. +# + +# %% +# Look at the first 20 x points from M=10 samples +print(f"{f_samples.shape=}") +print(f"{f_samples.mean(axis=0)[:20]=}") +print(f"{f_samples.mean(axis=0).mean()=}") +print(f"{f_samples.mean(axis=0).std()=}") + +# %% +# Take more samples, the means should get closer to c +f_samples = pri_f.sample(sample_shape=torch.Size((M * 200,))) +print(f"{f_samples.shape=}") +print(f"{f_samples.mean(axis=0)[:20]=}") +print(f"{f_samples.mean(axis=0).mean()=}") +print(f"{f_samples.mean(axis=0).std()=}") + +# %% [markdown] +# # Fit GP to data: optimize hyper params +# +# In each step of the optimizer, we condition on the training data (e.g. do +# Bayesian inference) to calculate the weight posterior for the current values +# of the hyper params. +# +# We use a simplistic PyTorch-style hand written train loop without convergence +# control, so make sure to use enough `n_iter` and eyeball-check that the loss +# is converged :-) +# +# Observe how all hyper params converge. In particular, note that the constant +# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`. + +# %% +# Train mode +model.train() +likelihood.train() + +optimizer = torch.optim.Adam(model.parameters(), lr=0.1) +loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) + +n_iter = 200 +history = defaultdict(list) +for ii in range(n_iter): + optimizer.zero_grad() + loss = -loss_func(model(X_train), y_train) + loss.backward() + optimizer.step() + if (ii + 1) % 10 == 0: + print(f"iter {ii+1}/{n_iter}, {loss=:.3f}") + for p_name, p_val in extract_model_params(model).items(): + history[p_name].append(p_val) + history["loss"].append(loss.item()) + +# Plot hyper params and loss (neg. log marginal likelihood) convergence +ncols = len(history) +fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5)) +for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") + + +# %% [markdown] +# # Run prediction + +# We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs. +# "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma +# I_N)}$) predictions, where $\ma\Sigma\equiv\cov(\ve f_*)$ is the posterior +# predictive covariance matrix from R&W 2006 eq. 
2.24 with $\ma K = K(X,X)$, +# $\ma K'=K(X_*, X)$ and $\ma K''=K(X_*, X_*)$, so +# +# $$\ma\Sigma = \ma K'' - \ma K'\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\ma K'^\top$$ +# +# See +# https://elcorto.github.io/gp_playground/content/gp_pred_comp/notebook_plot.html +# for details. + +# %% +# Evaluation (predictive posterior) mode +model.eval() +likelihood.eval() + +with torch.no_grad(): + post_pred_f = model(X_pred) + post_pred_y = likelihood(model(X_pred)) + + fig, axs = plt.subplots(ncols=2, figsize=(12, 5)) + for ii, (ax, post_pred) in enumerate(zip(axs, [post_pred_f, post_pred_y])): + yf_mean = post_pred.mean + yf_samples = post_pred.sample(sample_shape=torch.Size((10,))) + + ##lower, upper = post_pred.confidence_region() + yf_std = post_pred.stddev + lower = yf_mean - 2 * yf_std + upper = yf_mean + 2 * yf_std + ax.plot( + X_train.numpy(), + y_train.numpy(), + "o", + label="data", + color="tab:blue", + ) + ax.plot( + X_pred.numpy(), + yf_mean.numpy(), + label="mean", + color="tab:red", + lw=2, + ) + ax.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label="confidence", + color="tab:orange", + alpha=0.3, + ) + y_min = y_train.min() + y_max = y_train.max() + y_span = y_max - y_min + ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span]) + plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples") + if ii == 1: + ax.legend() + +# When running as script +if not is_interactive(): + plt.show() diff --git a/BLcourse2.3/README.md b/BLcourse2.3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..21766d2c861fc568cce33e935241b908c3ce8938 --- /dev/null +++ b/BLcourse2.3/README.md @@ -0,0 +1,11 @@ +Based on https://docs.gpytorch.ai/en/stable/examples/01_Exact_GPs/Simple_GP_Regression.html + +Use https://jupytext.readthedocs.io to create a notebook. + +```sh +$ jupytext --to ipynb notebook.py +$ jupyter-notebook notebook.ipynb +``` + +For convenience the ipynb file is included. Please keep it in sync with the py +file using jupytext.
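+
+One way to do that after editing either file (assuming a jupytext version
+that supports the `--sync` flag for paired files) is
+
+```sh
+$ jupytext --sync notebook.ipynb
+```
+
+The py file can also be run as a plain script, in which case the plots are
+shown at the end via `plt.show()`.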