diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..dcf3a58f824497e4f1f28539afea96c1b2bb81a6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.ipynb_checkpoints/ +__pycache__ diff --git a/BLcourse2.3/01_one_dim.ipynb b/BLcourse2.3/01_one_dim.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f5208de38e0c4b0cc1a1ed052d6d9393a318aa04 --- /dev/null +++ b/BLcourse2.3/01_one_dim.ipynb @@ -0,0 +1,590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2a8ed8ae", + "metadata": {}, + "source": [ + "# Notation\n", + "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n", + "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n", + "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\cov}{\\mathrm{cov}}$\n", + "$\\newcommand{\\test}[1]{#1_*}$\n", + "$\\newcommand{\\testtest}[1]{#1_{**}}$\n", + "\n", + "Vector $\\ve a\\in\\mathbb R^n$ or $\\mathbb R^{n\\times 1}$, so \"column\" vector.\n", + "Matrix $\\ma A\\in\\mathbb R^{n\\times m}$. Design matrix with input vectors $\\ve\n", + "x_i\\in\\mathbb R^D$: $\\ma X = [\\ldots, \\ve x_i, \\ldots]^\\top \\in\\mathbb\n", + "R^{N\\times D}$.\n", + "\n", + "We use 1D data, so in fact $\\ma X \\in\\mathbb R^{N\\times 1}$ is a vector, but\n", + "we still denote the collection of all $\\ve x_i = x_i\\in\\mathbb R$ points with\n", + "$\\ma X$ to keep the notation consistent with the slides." + ] + }, + { + "cell_type": "markdown", + "id": "ae440984", + "metadata": {}, + "source": [ + "# Imports, helpers, setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25b39064", + "metadata": {}, + "outputs": [], + "source": [ + "##%matplotlib notebook\n", + "##%matplotlib widget\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "960de85c", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from collections import defaultdict\n", + "from pprint import pprint\n", + "\n", + "import torch\n", + "import gpytorch\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib import is_interactive\n", + "\n", + "from utils import extract_model_params, plot_samples\n", + "\n", + "\n", + "# Default float32 results in slightly noisy prior samples. Less so with\n", + "# float64. We get a warning with both\n", + "# .../lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40:\n", + "# NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal\n", + "# but the noise is smaller w/ float64. Reason must be that the `sample()`\n", + "# method [1] calls `rsample()` [2] which performs a Cholesky decomposition of\n", + "# the covariance matrix. The default in\n", + "# np.random.default_rng().multivariate_normal() is method=\"svd\", which is\n", + "# slower but seemingly a bit more stable.\n", + "#\n", + "# [1] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.sample\n", + "# [2] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.rsample\n", + "torch.set_default_dtype(torch.float64)\n", + "\n", + "torch.manual_seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "7a41bb4f", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Generate toy 1D data\n", + "\n", + "Here we generate noisy 1D data `X_train`, `y_train` as well as an extended\n", + "x-axis `X_pred` which we use later for prediction also outside of the data\n", + "range (extrapolation). 
The data has a constant offset `const` which we use to\n", + "test learning a GP mean function $m(\\ve x)$. We create a gap in the data to\n", + "show how the model uncertainty will behave there." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30019345", + "metadata": {}, + "outputs": [], + "source": [ + "def generate_data(x, gaps=[[1, 3]], const=5):\n", + " y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const\n", + " msk = torch.tensor([True] * len(x))\n", + " if gaps is not None:\n", + " for g in gaps:\n", + " msk = msk & ~((x > g[0]) & (x < g[1]))\n", + " return x[msk], y[msk]\n", + "\n", + "\n", + "x = torch.linspace(0, 4 * math.pi, 100)\n", + "X_train, y_train = generate_data(x, gaps=[[6, 10]])\n", + "X_pred = torch.linspace(\n", + " X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n", + ")\n", + "\n", + "print(f\"{X_train.shape=}\")\n", + "print(f\"{y_train.shape=}\")\n", + "print(f\"{X_pred.shape=}\")\n", + "\n", + "plt.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\")" + ] + }, + { + "cell_type": "markdown", + "id": "7d326cde", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Define GP model\n", + "\n", + "We define the simplest possible textbook GP model using a Gaussian\n", + "likelihood. The kernel is the squared exponential kernel with a scaling\n", + "factor.\n", + "\n", + "$$\\kappa(\\ve x_i, \\ve x_j) = s\\,\\exp\\left(-\\frac{\\lVert\\ve x_i - \\ve x_j\\rVert_2^2}{2\\,\\ell^2}\\right)$$\n", + "\n", + "This makes two hyper params, namely the length scale $\\ell$ and the scaling\n", + "$s$. The latter is implemented by wrapping the `RBFKernel` with\n", + "`ScaleKernel`.\n", + "\n", + "In addition, we define a constant mean via `ConstantMean`. Finally we have\n", + "the likelihood noise $\\sigma_n^2$. 
So in total we have 4 hyper params.\n", + "\n", + "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n", + "* $\\sigma_n^2$ = `model.likelihood.noise_covar.noise`\n", + "* $s$ = `model.covar_module.outputscale`\n", + "* $m(\\ve x) = c$ = `model.mean_module.constant`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ce2844b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class ExactGPModel(gpytorch.models.ExactGP):\n", + " \"\"\"API:\n", + "\n", + " model.forward() prior f_pred\n", + " model() posterior f_pred\n", + "\n", + " likelihood(model.forward()) prior with noise y_pred\n", + " likelihood(model()) posterior with noise y_pred\n", + " \"\"\"\n", + "\n", + " def __init__(self, X_train, y_train, likelihood):\n", + " super().__init__(X_train, y_train, likelihood)\n", + " self.mean_module = gpytorch.means.ConstantMean()\n", + " self.covar_module = gpytorch.kernels.ScaleKernel(\n", + " gpytorch.kernels.RBFKernel()\n", + " )\n", + "\n", + " def forward(self, x):\n", + " \"\"\"The prior, defined in terms of the mean and covariance function.\"\"\"\n", + " mean_x = self.mean_module(x)\n", + " covar_x = self.covar_module(x)\n", + " return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)\n", + "\n", + "\n", + "likelihood = gpytorch.likelihoods.GaussianLikelihood()\n", + "model = ExactGPModel(X_train, y_train, likelihood)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8a2aab8", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the model\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a3989c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Default start hyper params\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c3808b3", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Set new start hyper params\n", + "model.mean_module.constant = 3.0\n", + "model.covar_module.base_kernel.lengthscale = 1.0\n", + "model.covar_module.outputscale = 1.0\n", + "model.likelihood.noise_covar.noise = 0.1\n", + "\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "markdown", + "id": "c3600af3", + "metadata": {}, + "source": [ + "# Sample from the GP prior\n", + "\n", + "We sample a number of functions $f_j, j=1,\\ldots,M$ from the GP prior and\n", + "evaluate them at all $\\ma X$ = `X_pred` points, of which we have $N=200$. So\n", + "we effectively generate samples from $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n", + "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ and the\n", + "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e74b8d1b", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " # Prior\n", + " M = 10\n", + " pri_f = model.forward(X_pred)\n", + " f_mean = pri_f.mean\n", + " f_std = pri_f.stddev\n", + " f_samples = pri_f.sample(sample_shape=torch.Size((M,)))\n", + " print(f\"{pri_f=}\")\n", + " print(f\"{pri_f.mean.shape=}\")\n", + " print(f\"{pri_f.covariance_matrix.shape=}\")\n", + " print(f\"{f_samples.shape=}\")\n", + " fig, ax = plt.subplots()\n", + " ax.plot(X_pred, f_mean, color=\"tab:red\", label=\"mean\", lw=2)\n", + " plot_samples(ax, X_pred, f_samples, label=\"prior samples\")\n", + " ax.fill_between(\n", + " X_pred,\n", + " f_mean - 2 * f_std,\n", + " f_mean + 2 * f_std,\n", + " color=\"tab:orange\",\n", + " alpha=0.2,\n", + " label=\"confidence\",\n", + " )\n", + " ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "0598b567", + "metadata": {}, + "source": [ + "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n", + "\\ve c$ does *not* mean that each sampled vector $\\predve f$'s mean is\n", + "equal to $c$. Instead, we have that at each $\\ve x_i$, the mean of\n", + "*all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M f_j(\\ve x_i)\n", + "\\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d7fa03e", + "metadata": {}, + "outputs": [], + "source": [ + "# Look at the first 20 x points from M=10 samples\n", + "print(f\"{f_samples.shape=}\")\n", + "print(f\"{f_samples.mean(axis=0)[:20]=}\")\n", + "print(f\"{f_samples.mean(axis=0).mean()=}\")\n", + "print(f\"{f_samples.mean(axis=0).std()=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20b6e415", + "metadata": {}, + "outputs": [], + "source": [ + "# Take more samples, the means should get closer to c\n", + "f_samples = pri_f.sample(sample_shape=torch.Size((M * 200,)))\n", + "print(f\"{f_samples.shape=}\")\n", + "print(f\"{f_samples.mean(axis=0)[:20]=}\")\n", + "print(f\"{f_samples.mean(axis=0).mean()=}\")\n", + "print(f\"{f_samples.mean(axis=0).std()=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "25b90532", + "metadata": {}, + "source": [ + "# GP posterior predictive distribution with fixed hyper params\n", + "\n", + "Now we calculate the posterior predictive distribution $p(\\test{\\predve\n", + "f}|\\test{\\ma X}, \\ma X, \\ve y)$, i.e. we condition on the train data (Bayesian\n", + "inference).\n", + "\n", + "We use the fixed hyper param values defined above. In particular, since\n", + "$\\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a\n", + "regression setting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d01951f1", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Evaluation (predictive posterior) mode\n", + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " M = 10\n", + " post_pred_f = model(X_pred)\n", + "\n", + " fig, ax = plt.subplots()\n", + " f_mean = post_pred_f.mean\n", + " f_samples = post_pred_f.sample(sample_shape=torch.Size((M,)))\n", + " f_std = post_pred_f.stddev\n", + " lower = f_mean - 2 * f_std\n", + " upper = f_mean + 2 * f_std\n", + " ax.plot(\n", + " X_train.numpy(),\n", + " y_train.numpy(),\n", + " \"o\",\n", + " label=\"data\",\n", + " color=\"tab:blue\",\n", + " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " f_mean.numpy(),\n", + " label=\"mean\",\n", + " color=\"tab:red\",\n", + " lw=2,\n", + " )\n", + " ax.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=\"confidence\",\n", + " color=\"tab:orange\",\n", + " alpha=0.3,\n", + " )\n", + " y_min = y_train.min()\n", + " y_max = y_train.max()\n", + " y_span = y_max - y_min\n", + " ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])\n", + " plot_samples(ax, X_pred, f_samples, label=\"posterior pred. samples\")\n", + " ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "b960a745", + "metadata": {}, + "source": [ + "# Fit GP to data: optimize hyper params\n", + "\n", + "In each step of the optimizer, we condition on the training data (e.g. do\n", + "Bayesian inference) to calculate the posterior predictive distribution for\n", + "the current values of the hyper params. We iterate until the log marginal\n", + "likelihood is converged.\n", + "\n", + "We use a simplistic PyTorch-style hand written train loop without convergence\n", + "control, so make sure to use enough `n_iter` and eyeball-check that the loss\n", + "is converged :-)\n", + "\n", + "Observe how all hyper params converge. In particular, note that the constant\n", + "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc059dc2", + "metadata": {}, + "outputs": [], + "source": [ + "# Train mode\n", + "model.train()\n", + "likelihood.train()\n", + "\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.1)\n", + "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n", + "\n", + "n_iter = 200\n", + "history = defaultdict(list)\n", + "for ii in range(n_iter):\n", + " optimizer.zero_grad()\n", + " loss = -loss_func(model(X_train), y_train)\n", + " loss.backward()\n", + " optimizer.step()\n", + " if (ii + 1) % 10 == 0:\n", + " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", + " for p_name, p_val in extract_model_params(model).items():\n", + " history[p_name].append(p_val)\n", + " history[\"loss\"].append(loss.item())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c9e7052", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot hyper params and loss (neg. 
log marginal likelihood) convergence\n", + "ncols = len(history)\n", + "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", + "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1fe908", + "metadata": {}, + "outputs": [], + "source": [ + "# Values of optimized hyper params\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "markdown", + "id": "98aefb90", + "metadata": {}, + "source": [ + "# Run prediction\n", + "\n", + "We show \"noiseless\" (left: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma)}$) vs.\n", + "\"noisy\" (right: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma + \\sigma_n^2\\,\\ma\n", + "I_N)}$) predictions with\n", + "\n", + "$$\\ma\\Sigma = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$$\n", + "\n", + "We find that $\\ma\\Sigma$ reflects behavior we would like to see from\n", + "epistemic uncertainty -- it is high when we have no data\n", + "(out-of-distribution). But this alone isn't the whole story. We need to add\n", + "the estimated noise level $\\sigma_n^2$ in order for the confidence band to\n", + "cover the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a78de0e4", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluation (predictive posterior) mode\n", + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " M = 10\n", + " post_pred_f = model(X_pred)\n", + " post_pred_y = likelihood(model(X_pred))\n", + "\n", + " fig, axs = plt.subplots(ncols=2, figsize=(12, 5))\n", + " fig_sigmas, ax_sigmas = plt.subplots()\n", + " for ii, (ax, post_pred, name) in enumerate(\n", + " zip(axs, [post_pred_f, post_pred_y], [\"f\", \"y\"])\n", + " ):\n", + " yf_mean = post_pred.mean\n", + " yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n", + "\n", + " yf_std = post_pred.stddev\n", + " lower = yf_mean - 2 * yf_std\n", + " upper = yf_mean + 2 * yf_std\n", + " ax.plot(\n", + " X_train.numpy(),\n", + " y_train.numpy(),\n", + " \"o\",\n", + " label=\"data\",\n", + " color=\"tab:blue\",\n", + " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " yf_mean.numpy(),\n", + " label=\"mean\",\n", + " color=\"tab:red\",\n", + " lw=2,\n", + " )\n", + " ax.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=\"confidence\",\n", + " color=\"tab:orange\",\n", + " alpha=0.3,\n", + " )\n", + " if name == \"f\":\n", + " sigma_label = r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma)}$\"\n", + " zorder = 1\n", + " else:\n", + " sigma_label = (\n", + " r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma + \\sigma_n^2\\,I)}$\"\n", + " )\n", + " zorder = 0\n", + " ax_sigmas.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=\"confidence \" + sigma_label,\n", + " color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n", + " alpha=0.5,\n", + " zorder=zorder,\n", + " )\n", + " y_min = y_train.min()\n", + " y_max = y_train.max()\n", + " y_span = y_max - y_min\n", + " ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])\n", + " plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. 
samples\")\n", + " if ii == 1:\n", + " ax.legend()\n", + " ax_sigmas.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e726d3d1", + "metadata": {}, + "outputs": [], + "source": [ + "# When running as script\n", + "if not is_interactive():\n", + " plt.show()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:light" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py new file mode 100644 index 0000000000000000000000000000000000000000..e9e55f233fc306eb2da38ca98ce710d5a359daa0 --- /dev/null +++ b/BLcourse2.3/01_one_dim.py @@ -0,0 +1,419 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.16.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # Notation +# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$ +# $\newcommand{\ma}[1]{\mathbf{#1}}$ +# $\newcommand{\pred}[1]{\rm{#1}}$ +# $\newcommand{\predve}[1]{\mathbf{#1}}$ +# $\newcommand{\cov}{\mathrm{cov}}$ +# $\newcommand{\test}[1]{#1_*}$ +# $\newcommand{\testtest}[1]{#1_{**}}$ +# +# Vector $\ve a\in\mathbb R^n$ or $\mathbb R^{n\times 1}$, so "column" vector. +# Matrix $\ma A\in\mathbb R^{n\times m}$. Design matrix with input vectors $\ve +# x_i\in\mathbb R^D$: $\ma X = [\ldots, \ve x_i, \ldots]^\top \in\mathbb +# R^{N\times D}$. +# +# We use 1D data, so in fact $\ma X \in\mathbb R^{N\times 1}$ is a vector, but +# we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with +# $\ma X$ to keep the notation consistent with the slides. + +# # Imports, helpers, setup + +# ##%matplotlib notebook +# ##%matplotlib widget +# %matplotlib inline + +# + +import math +from collections import defaultdict +from pprint import pprint + +import torch +import gpytorch +from matplotlib import pyplot as plt +from matplotlib import is_interactive + +from utils import extract_model_params, plot_samples + + +# Default float32 results in slightly noisy prior samples. Less so with +# float64. We get a warning with both +# .../lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: +# NumericalWarning: A not p.d., added jitter of 1.0e-08 to the diagonal +# but the noise is smaller w/ float64. Reason must be that the `sample()` +# method [1] calls `rsample()` [2] which performs a Cholesky decomposition of +# the covariance matrix. The default in +# np.random.default_rng().multivariate_normal() is method="svd", which is +# slower but seemingly a bit more stable. +# +# [1] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.sample +# [2] https://docs.gpytorch.ai/en/stable/distributions.html#gpytorch.distributions.MultivariateNormal.rsample +torch.set_default_dtype(torch.float64) + +torch.manual_seed(123) +# - + +# # Generate toy 1D data +# +# Here we generate noisy 1D data `X_train`, `y_train` as well as an extended +# x-axis `X_pred` which we use later for prediction also outside of the data +# range (extrapolation). The data has a constant offset `const` which we use to +# test learning a GP mean function $m(\ve x)$. We create a gap in the data to +# show how the model uncertainty will behave there. 
+ + +# + +def generate_data(x, gaps=[[1, 3]], const=5): + y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const + msk = torch.tensor([True] * len(x)) + if gaps is not None: + for g in gaps: + msk = msk & ~((x > g[0]) & (x < g[1])) + return x[msk], y[msk] + + +x = torch.linspace(0, 4 * math.pi, 100) +X_train, y_train = generate_data(x, gaps=[[6, 10]]) +X_pred = torch.linspace( + X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False +) + +print(f"{X_train.shape=}") +print(f"{y_train.shape=}") +print(f"{X_pred.shape=}") + +plt.scatter(X_train, y_train, marker="o", color="tab:blue") +# - + +# # Define GP model +# +# We define the simplest possible textbook GP model using a Gaussian +# likelihood. The kernel is the squared exponential kernel with a scaling +# factor. +# +# $$\kappa(\ve x_i, \ve x_j) = s\,\exp\left(-\frac{\lVert\ve x_i - \ve x_j\rVert_2^2}{2\,\ell^2}\right)$$ +# +# This makes two hyper params, namely the length scale $\ell$ and the scaling +# $s$. The latter is implemented by wrapping the `RBFKernel` with +# `ScaleKernel`. +# +# In addition, we define a constant mean via `ConstantMean`. Finally we have +# the likelihood noise $\sigma_n^2$. So in total we have 4 hyper params. +# +# * $\ell$ = `model.covar_module.base_kernel.lengthscale` +# * $\sigma_n^2$ = `model.likelihood.noise_covar.noise` +# * $s$ = `model.covar_module.outputscale` +# * $m(\ve x) = c$ = `model.mean_module.constant` + + +# + +class ExactGPModel(gpytorch.models.ExactGP): + """API: + + model.forward() prior f_pred + model() posterior f_pred + + likelihood(model.forward()) prior with noise y_pred + likelihood(model()) posterior with noise y_pred + """ + + def __init__(self, X_train, y_train, likelihood): + super().__init__(X_train, y_train, likelihood) + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel() + ) + + def forward(self, x): + """The prior, defined in terms of the mean and covariance function.""" + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +likelihood = gpytorch.likelihoods.GaussianLikelihood() +model = ExactGPModel(X_train, y_train, likelihood) +# - + + +# Inspect the model +print(model) + +# Default start hyper params +pprint(extract_model_params(model, raw=False)) + +# + +# Set new start hyper params +model.mean_module.constant = 3.0 +model.covar_module.base_kernel.lengthscale = 1.0 +model.covar_module.outputscale = 1.0 +model.likelihood.noise_covar.noise = 0.1 + +pprint(extract_model_params(model, raw=False)) +# - + + +# # Sample from the GP prior +# +# We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and +# evaluate them at all $\ma X$ = `X_pred` points, of which we have $N=200$. So +# we effectively generate samples from $p(\predve f|\ma X) = \mathcal N(\ve +# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ and the +# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. 
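+#
+# As a minimal sketch of what this sampling means (for illustration only --
+# the next cell does the same thing properly via GPyTorch, the literal values
+# for $\ell$, $s$, $c$ simply repeat the start hyper params set above, and the
+# names `ell`, `s`, `c`, `f_manual` are ad hoc), one can build the RBF kernel
+# matrix $\ma K$ directly and draw $\predve f = \ve c + \ma L\,\ve\epsilon$
+# with $\ma K = \ma L\,\ma L^\top$ (Cholesky) and
+# $\ve\epsilon\sim\mathcal N(\ve 0, \ma I)$.
+
+# +
+with torch.no_grad():
+    ell, s, c = 1.0, 1.0, 3.0
+    xs = X_pred[:, None]
+    # RBF (squared exponential) kernel matrix, N x N
+    K = s * torch.exp(-((xs - xs.T) ** 2) / (2 * ell**2))
+    # Small diagonal jitter for a numerically stable Cholesky factorization
+    L = torch.linalg.cholesky(K + 1.0e-8 * torch.eye(len(X_pred)))
+    # Three prior samples, each a vector of length N
+    f_manual = c + L @ torch.randn(len(X_pred), 3)
+    print(f"{f_manual.shape=}")
+# -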
+ +# + +model.eval() +likelihood.eval() + +with torch.no_grad(): + # Prior + M = 10 + pri_f = model.forward(X_pred) + f_mean = pri_f.mean + f_std = pri_f.stddev + f_samples = pri_f.sample(sample_shape=torch.Size((M,))) + print(f"{pri_f=}") + print(f"{pri_f.mean.shape=}") + print(f"{pri_f.covariance_matrix.shape=}") + print(f"{f_samples.shape=}") + fig, ax = plt.subplots() + ax.plot(X_pred, f_mean, color="tab:red", label="mean", lw=2) + plot_samples(ax, X_pred, f_samples, label="prior samples") + ax.fill_between( + X_pred, + f_mean - 2 * f_std, + f_mean + 2 * f_std, + color="tab:orange", + alpha=0.2, + label="confidence", + ) + ax.legend() +# - + + +# Let's investigate the samples more closely. A constant mean $\ve m(\ma X) = +# \ve c$ does *not* mean that each sampled vector $\predve f$'s mean is +# equal to $c$. Instead, we have that at each $\ve x_i$, the mean of +# *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M f_j(\ve x_i) +# \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. +# + +# Look at the first 20 x points from M=10 samples +print(f"{f_samples.shape=}") +print(f"{f_samples.mean(axis=0)[:20]=}") +print(f"{f_samples.mean(axis=0).mean()=}") +print(f"{f_samples.mean(axis=0).std()=}") + +# Take more samples, the means should get closer to c +f_samples = pri_f.sample(sample_shape=torch.Size((M * 200,))) +print(f"{f_samples.shape=}") +print(f"{f_samples.mean(axis=0)[:20]=}") +print(f"{f_samples.mean(axis=0).mean()=}") +print(f"{f_samples.mean(axis=0).std()=}") + +# # GP posterior predictive distribution with fixed hyper params +# +# Now we calculate the posterior predictive distribution $p(\test{\predve +# f}|\test{\ma X}, \ma X, \ve y)$, i.e. we condition on the train data (Bayesian +# inference). +# +# We use the fixed hyper param values defined above. In particular, since +# $\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a +# regression setting. + +# + +# Evaluation (predictive posterior) mode +model.eval() +likelihood.eval() + +with torch.no_grad(): + M = 10 + post_pred_f = model(X_pred) + + fig, ax = plt.subplots() + f_mean = post_pred_f.mean + f_samples = post_pred_f.sample(sample_shape=torch.Size((M,))) + f_std = post_pred_f.stddev + lower = f_mean - 2 * f_std + upper = f_mean + 2 * f_std + ax.plot( + X_train.numpy(), + y_train.numpy(), + "o", + label="data", + color="tab:blue", + ) + ax.plot( + X_pred.numpy(), + f_mean.numpy(), + label="mean", + color="tab:red", + lw=2, + ) + ax.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label="confidence", + color="tab:orange", + alpha=0.3, + ) + y_min = y_train.min() + y_max = y_train.max() + y_span = y_max - y_min + ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span]) + plot_samples(ax, X_pred, f_samples, label="posterior pred. samples") + ax.legend() +# - + + +# # Fit GP to data: optimize hyper params +# +# In each step of the optimizer, we condition on the training data (e.g. do +# Bayesian inference) to calculate the posterior predictive distribution for +# the current values of the hyper params. We iterate until the log marginal +# likelihood is converged. +# +# We use a simplistic PyTorch-style hand written train loop without convergence +# control, so make sure to use enough `n_iter` and eyeball-check that the loss +# is converged :-) +# +# Observe how all hyper params converge. In particular, note that the constant +# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`. 
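+#
+# Since the loop below has no convergence control, a small helper like the
+# following could be used to stop early once the negative log marginal
+# likelihood plateaus. This is only a sketch: the name `loss_converged`, the
+# tolerance and the window size are illustrative choices, not part of the
+# course material.
+
+# +
+def loss_converged(loss_history, tol=1e-4, window=10):
+    """True once the loss changed by less than `tol` over the last `window` iters."""
+    if len(loss_history) < window + 1:
+        return False
+    return abs(loss_history[-1] - loss_history[-1 - window]) < tol
+# -
+
+# One would then add `if loss_converged(history["loss"]): break` at the end of
+# the loop body below.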
+ +# + +# Train mode +model.train() +likelihood.train() + +optimizer = torch.optim.Adam(model.parameters(), lr=0.1) +loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) + +n_iter = 200 +history = defaultdict(list) +for ii in range(n_iter): + optimizer.zero_grad() + loss = -loss_func(model(X_train), y_train) + loss.backward() + optimizer.step() + if (ii + 1) % 10 == 0: + print(f"iter {ii+1}/{n_iter}, {loss=:.3f}") + for p_name, p_val in extract_model_params(model).items(): + history[p_name].append(p_val) + history["loss"].append(loss.item()) +# - + +# Plot hyper params and loss (neg. log marginal likelihood) convergence +ncols = len(history) +fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5)) +for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") + +# Values of optimized hyper params +pprint(extract_model_params(model, raw=False)) + +# # Run prediction +# +# We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs. +# "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma +# I_N)}$) predictions with +# +# $$\ma\Sigma = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$$ +# +# We find that $\ma\Sigma$ reflects behavior we would like to see from +# epistemic uncertainty -- it is high when we have no data +# (out-of-distribution). But this alone isn't the whole story. We need to add +# the estimated noise level $\sigma_n^2$ in order for the confidence band to +# cover the data. + +# + +# Evaluation (predictive posterior) mode +model.eval() +likelihood.eval() + +with torch.no_grad(): + M = 10 + post_pred_f = model(X_pred) + post_pred_y = likelihood(model(X_pred)) + + fig, axs = plt.subplots(ncols=2, figsize=(12, 5)) + fig_sigmas, ax_sigmas = plt.subplots() + for ii, (ax, post_pred, name) in enumerate( + zip(axs, [post_pred_f, post_pred_y], ["f", "y"]) + ): + yf_mean = post_pred.mean + yf_samples = post_pred.sample(sample_shape=torch.Size((M,))) + + yf_std = post_pred.stddev + lower = yf_mean - 2 * yf_std + upper = yf_mean + 2 * yf_std + ax.plot( + X_train.numpy(), + y_train.numpy(), + "o", + label="data", + color="tab:blue", + ) + ax.plot( + X_pred.numpy(), + yf_mean.numpy(), + label="mean", + color="tab:red", + lw=2, + ) + ax.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label="confidence", + color="tab:orange", + alpha=0.3, + ) + if name == "f": + sigma_label = r"$\pm 2\sqrt{\mathrm{diag}(\Sigma)}$" + zorder = 1 + else: + sigma_label = ( + r"$\pm 2\sqrt{\mathrm{diag}(\Sigma + \sigma_n^2\,I)}$" + ) + zorder = 0 + ax_sigmas.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label="confidence " + sigma_label, + color="tab:orange" if name == "f" else "tab:blue", + alpha=0.5, + zorder=zorder, + ) + y_min = y_train.min() + y_max = y_train.max() + y_span = y_max - y_min + ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span]) + plot_samples(ax, X_pred, yf_samples, label="posterior pred. 
samples") + if ii == 1: + ax.legend() + ax_sigmas.legend() +# - + +# + +# When running as script +if not is_interactive(): + plt.show() +# - diff --git a/BLcourse2.3/02_two_dim.ipynb b/BLcourse2.3/02_two_dim.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a87daa957597ef0730b108251a6f5c459c7c8eec --- /dev/null +++ b/BLcourse2.3/02_two_dim.ipynb @@ -0,0 +1,594 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b88a37d9", + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib notebook\n", + "##%matplotlib widget\n", + "##%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64c21c28", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "from pprint import pprint\n", + "\n", + "import torch\n", + "import gpytorch\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib import is_interactive\n", + "import numpy as np\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "from utils import extract_model_params, fig_ax_3d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71da09db", + "metadata": {}, + "outputs": [], + "source": [ + "torch.set_default_dtype(torch.float64)\n", + "torch.manual_seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "e5786965", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Generate toy 2D data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee7a8e4a", + "metadata": {}, + "outputs": [], + "source": [ + "class MexicanHat:\n", + " def __init__(self, xlim, ylim, nx, ny, mode, **kwds):\n", + " self.xlim = xlim\n", + " self.ylim = ylim\n", + " self.nx = nx\n", + " self.ny = ny\n", + " self.xg, self.yg = self._get_xy_grid()\n", + " self.XG, self.YG = self._get_meshgrids(self.xg, self.yg)\n", + " self.X = self._make_X(mode)\n", + " self.z = self.func(self.X)\n", + "\n", + " def _make_X(self, mode=\"grid\"):\n", + " if mode == \"grid\":\n", + " X = torch.empty((self.nx * self.ny, 2))\n", + " X[:, 0] = self.XG.flatten()\n", + " X[:, 1] = self.YG.flatten()\n", + " elif mode == \"rand\":\n", + " X = torch.rand(self.nx * self.ny, 2)\n", + " X[:, 0] = X[:, 0] * (self.xlim[1] - self.xlim[0]) + self.xlim[0]\n", + " X[:, 1] = X[:, 1] * (self.ylim[1] - self.ylim[0]) + self.ylim[0]\n", + " return X\n", + "\n", + " def _get_xy_grid(self):\n", + " x = torch.linspace(self.xlim[0], self.xlim[1], self.nx)\n", + " y = torch.linspace(self.ylim[0], self.ylim[1], self.ny)\n", + " return x, y\n", + "\n", + " @staticmethod\n", + " def _get_meshgrids(xg, yg):\n", + " return torch.meshgrid(xg, yg, indexing=\"ij\")\n", + "\n", + " @staticmethod\n", + " def func(X):\n", + " r = torch.sqrt((X**2).sum(axis=1))\n", + " return torch.sin(r) / r\n", + "\n", + " @staticmethod\n", + " def n2t(x):\n", + " return torch.from_numpy(x)\n", + "\n", + " def apply_scalers(self, x_scaler, y_scaler):\n", + " self.X = self.n2t(x_scaler.transform(self.X))\n", + " Xtmp = x_scaler.transform(torch.stack((self.xg, self.yg), dim=1))\n", + " self.XG, self.YG = self._get_meshgrids(\n", + " self.n2t(Xtmp[:, 0]), self.n2t(Xtmp[:, 1])\n", + " )\n", + " self.z = self.n2t(y_scaler.transform(self.z[:, None])[:, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0058371a", + "metadata": {}, + "outputs": [], + "source": [ + "data_train = MexicanHat(\n", + " xlim=[-15, 25], ylim=[-15, 5], nx=20, ny=20, mode=\"rand\"\n", + ")\n", + 
"x_scaler = StandardScaler().fit(data_train.X)\n", + "y_scaler = StandardScaler().fit(data_train.z[:, None])\n", + "data_train.apply_scalers(x_scaler, y_scaler)\n", + "\n", + "data_pred = MexicanHat(\n", + " xlim=[-15, 25], ylim=[-15, 5], nx=100, ny=100, mode=\"grid\"\n", + ")\n", + "data_pred.apply_scalers(x_scaler, y_scaler)\n", + "\n", + "# train inputs\n", + "X_train = data_train.X\n", + "\n", + "# inputs for prediction and plotting\n", + "X_pred = data_pred.X" + ] + }, + { + "cell_type": "markdown", + "id": "3523c08d", + "metadata": {}, + "source": [ + "# Exercise 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ec4ad3d", + "metadata": {}, + "outputs": [], + "source": [ + "use_noise = False\n", + "use_gap = False" + ] + }, + { + "cell_type": "markdown", + "id": "f4592094", + "metadata": {}, + "source": [ + "# Exercise 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bec0f58", + "metadata": {}, + "outputs": [], + "source": [ + "##use_noise = True\n", + "##use_gap = False" + ] + }, + { + "cell_type": "markdown", + "id": "865866ea", + "metadata": {}, + "source": [ + "# Exercise 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5132eaeb", + "metadata": {}, + "outputs": [], + "source": [ + "##use_noise = False\n", + "##use_gap = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6efe09b3", + "metadata": {}, + "outputs": [], + "source": [ + "if use_noise:\n", + " # noisy train data\n", + " noise_std = 0.2\n", + " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", + " y_train = data_train.z + noise_dist.sample_n(len(data_train.z))\n", + "else:\n", + " # noise-free train data\n", + " noise_std = 0\n", + " y_train = data_train.z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af637911", + "metadata": {}, + "outputs": [], + "source": [ + "# Cut out part of the train data to create out-of-distribution predictions\n", + "\n", + "if use_gap:\n", + " mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)\n", + " X_train = X_train[~mask, :]\n", + " y_train = y_train[~mask]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eab0f4fa", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = fig_ax_3d()\n", + "s0 = ax.plot_surface(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " data_pred.z.reshape((data_pred.nx, data_pred.ny)),\n", + " color=\"tab:grey\",\n", + " alpha=0.5,\n", + ")\n", + "s1 = ax.scatter(\n", + " xs=X_train[:, 0],\n", + " ys=X_train[:, 1],\n", + " zs=y_train,\n", + " color=\"tab:blue\",\n", + " alpha=0.5,\n", + ")\n", + "ax.set_xlabel(\"X_0\")\n", + "ax.set_ylabel(\"X_1\")" + ] + }, + { + "cell_type": "markdown", + "id": "a00bf4e4", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Define GP model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52834c9f", + "metadata": {}, + "outputs": [], + "source": [ + "class ExactGPModel(gpytorch.models.ExactGP):\n", + " \"\"\"API:\n", + "\n", + " model.forward() prior f_pred\n", + " model() posterior f_pred\n", + "\n", + " likelihood(model.forward()) prior with noise y_pred\n", + " likelihood(model()) posterior with noise y_pred\n", + " \"\"\"\n", + "\n", + " def __init__(self, X_train, y_train, likelihood):\n", + " super().__init__(X_train, y_train, likelihood)\n", + " self.mean_module = gpytorch.means.ConstantMean()\n", + " self.covar_module = gpytorch.kernels.ScaleKernel(\n", + " gpytorch.kernels.RBFKernel()\n", + " )\n", + "\n", + 
" def forward(self, x):\n", + " \"\"\"The prior, defined in terms of the mean and covariance function.\"\"\"\n", + " mean_x = self.mean_module(x)\n", + " covar_x = self.covar_module(x)\n", + " return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)\n", + "\n", + "\n", + "likelihood = gpytorch.likelihoods.GaussianLikelihood()\n", + "model = ExactGPModel(X_train, y_train, likelihood)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fcef3dc", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the model\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25397c1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Default start hyper params\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a94067cc", + "metadata": {}, + "outputs": [], + "source": [ + "# Set new start hyper params\n", + "model.mean_module.constant = 0.0\n", + "model.covar_module.base_kernel.lengthscale = 3.0\n", + "model.covar_module.outputscale = 8.0\n", + "model.likelihood.noise_covar.noise = 0.1\n", + "\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "markdown", + "id": "a5b1a9ee", + "metadata": {}, + "source": [ + "# Fit GP to data: optimize hyper params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c15e6d45", + "metadata": {}, + "outputs": [], + "source": [ + "# Train mode\n", + "model.train()\n", + "likelihood.train()\n", + "\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.2)\n", + "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n", + "\n", + "n_iter = 300\n", + "history = defaultdict(list)\n", + "for ii in range(n_iter):\n", + " optimizer.zero_grad()\n", + " loss = -loss_func(model(X_train), y_train)\n", + " loss.backward()\n", + " optimizer.step()\n", + " if (ii + 1) % 10 == 0:\n", + " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", + " for p_name, p_val in extract_model_params(model).items():\n", + " history[p_name].append(p_val)\n", + " history[\"loss\"].append(loss.item())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7a4643", + "metadata": {}, + "outputs": [], + "source": [ + "ncols = len(history)\n", + "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", + "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5efa3b9d", + "metadata": {}, + "outputs": [], + "source": [ + "# Values of optimized hyper params\n", + "pprint(extract_model_params(model, raw=False))" + ] + }, + { + "cell_type": "markdown", + "id": "898f74f7", + "metadata": {}, + "source": [ + "# Run prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e18623", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " post_pred_f = model(X_pred)\n", + " post_pred_y = likelihood(model(X_pred))\n", + "\n", + " fig = plt.figure()\n", + " ax = fig.add_subplot(projection=\"3d\")\n", + " ax.plot_surface(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " data_pred.z.reshape((data_pred.nx, data_pred.ny)),\n", + " color=\"tab:grey\",\n", + " alpha=0.5,\n", + " )\n", + " ax.plot_surface(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " 
post_pred_y.mean.reshape((data_pred.nx, data_pred.ny)),\n", + " color=\"tab:red\",\n", + " alpha=0.5,\n", + " )\n", + " ax.set_xlabel(\"X_0\")\n", + " ax.set_ylabel(\"X_1\")\n", + "\n", + "assert (post_pred_f.mean == post_pred_y.mean).all()" + ] + }, + { + "cell_type": "markdown", + "id": "591e453d", + "metadata": {}, + "source": [ + "# Plot difference to ground truth and uncertainty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7f294c4", + "metadata": {}, + "outputs": [], + "source": [ + "ncols = 3\n", + "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))\n", + "\n", + "vmax = post_pred_y.stddev.max()\n", + "cs = []\n", + "\n", + "cs.append(\n", + " axs[0].contourf(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " torch.abs(post_pred_y.mean - data_pred.z).reshape(\n", + " (data_pred.nx, data_pred.ny)\n", + " ),\n", + " )\n", + ")\n", + "axs[0].set_title(\"|y_pred - y_true|\")\n", + "\n", + "cs.append(\n", + " axs[1].contourf(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " vmin=0,\n", + " vmax=vmax,\n", + " )\n", + ")\n", + "axs[1].set_title(\"f_std (epistemic)\")\n", + "\n", + "cs.append(\n", + " axs[2].contourf(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " vmin=0,\n", + " vmax=vmax,\n", + " )\n", + ")\n", + "axs[2].set_title(\"y_std (epistemic + aleatoric)\")\n", + "\n", + "for ax, c in zip(axs, cs):\n", + " ax.set_xlabel(\"X_0\")\n", + " ax.set_ylabel(\"X_1\")\n", + " ax.scatter(x=X_train[:, 0], y=X_train[:, 1], color=\"white\", alpha=0.2)\n", + " fig.colorbar(c, ax=ax)" + ] + }, + { + "cell_type": "markdown", + "id": "04257cea", + "metadata": {}, + "source": [ + "# Check learned noise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d966f54", + "metadata": {}, + "outputs": [], + "source": [ + "print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt())\n", + "print(\n", + " np.sqrt(\n", + " extract_model_params(model, raw=False)[\"likelihood.noise_covar.noise\"]\n", + " )\n", + ")\n", + "print(noise_std)" + ] + }, + { + "cell_type": "markdown", + "id": "1da209ff", + "metadata": {}, + "source": [ + "# Plot confidence bands" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f56936d", + "metadata": {}, + "outputs": [], + "source": [ + "y_mean = post_pred_y.mean.reshape((data_pred.nx, data_pred.ny))\n", + "y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))\n", + "upper = y_mean + 2 * y_std\n", + "lower = y_mean - 2 * y_std\n", + "\n", + "fig, ax = fig_ax_3d()\n", + "for Z, color in [(upper, \"tab:green\"), (lower, \"tab:red\")]:\n", + " ax.plot_surface(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " Z,\n", + " color=color,\n", + " alpha=0.5,\n", + " )\n", + "\n", + "contour_z = lower.min() - 1\n", + "zlim = ax.get_xlim()\n", + "ax.set_zlim((contour_z, zlim[1] + abs(contour_z)))\n", + "ax.contourf(data_pred.XG, data_pred.YG, y_std, zdir=\"z\", offset=contour_z)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d25b4506", + "metadata": {}, + "outputs": [], + "source": [ + "# When running as script\n", + "if not is_interactive():\n", + " plt.show()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:light" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py new file mode 100644 index 0000000000000000000000000000000000000000..b7ee2273a279ef69e86f6d47fb1ac71d4fb13f97 --- /dev/null +++ b/BLcourse2.3/02_two_dim.py @@ -0,0 +1,377 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.16.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# + +# %matplotlib notebook +# ##%matplotlib widget +# ##%matplotlib inline +# - + +# + +from collections import defaultdict +from pprint import pprint + +import torch +import gpytorch +from matplotlib import pyplot as plt +from matplotlib import is_interactive +import numpy as np + +from sklearn.preprocessing import StandardScaler + +from utils import extract_model_params, fig_ax_3d +# - + + +torch.set_default_dtype(torch.float64) +torch.manual_seed(123) + +# # Generate toy 2D data + + +class MexicanHat: + def __init__(self, xlim, ylim, nx, ny, mode, **kwds): + self.xlim = xlim + self.ylim = ylim + self.nx = nx + self.ny = ny + self.xg, self.yg = self._get_xy_grid() + self.XG, self.YG = self._get_meshgrids(self.xg, self.yg) + self.X = self._make_X(mode) + self.z = self.func(self.X) + + def _make_X(self, mode="grid"): + if mode == "grid": + X = torch.empty((self.nx * self.ny, 2)) + X[:, 0] = self.XG.flatten() + X[:, 1] = self.YG.flatten() + elif mode == "rand": + X = torch.rand(self.nx * self.ny, 2) + X[:, 0] = X[:, 0] * (self.xlim[1] - self.xlim[0]) + self.xlim[0] + X[:, 1] = X[:, 1] * (self.ylim[1] - self.ylim[0]) + self.ylim[0] + return X + + def _get_xy_grid(self): + x = torch.linspace(self.xlim[0], self.xlim[1], self.nx) + y = torch.linspace(self.ylim[0], self.ylim[1], self.ny) + return x, y + + @staticmethod + def _get_meshgrids(xg, yg): + return torch.meshgrid(xg, yg, indexing="ij") + + @staticmethod + def func(X): + r = torch.sqrt((X**2).sum(axis=1)) + return torch.sin(r) / r + + @staticmethod + def n2t(x): + return torch.from_numpy(x) + + def apply_scalers(self, x_scaler, y_scaler): + self.X = self.n2t(x_scaler.transform(self.X)) + Xtmp = x_scaler.transform(torch.stack((self.xg, self.yg), dim=1)) + self.XG, self.YG = self._get_meshgrids( + self.n2t(Xtmp[:, 0]), self.n2t(Xtmp[:, 1]) + ) + self.z = self.n2t(y_scaler.transform(self.z[:, None])[:, 0]) + + +# + +data_train = MexicanHat( + xlim=[-15, 25], ylim=[-15, 5], nx=20, ny=20, mode="rand" +) +x_scaler = StandardScaler().fit(data_train.X) +y_scaler = StandardScaler().fit(data_train.z[:, None]) +data_train.apply_scalers(x_scaler, y_scaler) + +data_pred = MexicanHat( + xlim=[-15, 25], ylim=[-15, 5], nx=100, ny=100, mode="grid" +) +data_pred.apply_scalers(x_scaler, y_scaler) + +# train inputs +X_train = data_train.X + +# inputs for prediction and plotting +X_pred = data_pred.X +# - + +# # Exercise 1 + +# + +use_noise = False +use_gap = False +# - + +# # Exercise 2 + +# + +##use_noise = True +##use_gap = False +# - + +# # Exercise 3 + +# + +##use_noise = False +##use_gap = True +# - + +# + +if use_noise: + # noisy train data + noise_std = 0.2 + noise_dist = torch.distributions.Normal(loc=0, scale=noise_std) + y_train = data_train.z + noise_dist.sample_n(len(data_train.z)) +else: + # noise-free train data + noise_std = 0 + y_train = data_train.z +# - + +# + +# Cut out part of the train data to create out-of-distribution predictions + +if use_gap: + mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0) + X_train = X_train[~mask, :] + y_train = 
y_train[~mask] +# - + +fig, ax = fig_ax_3d() +s0 = ax.plot_surface( + data_pred.XG, + data_pred.YG, + data_pred.z.reshape((data_pred.nx, data_pred.ny)), + color="tab:grey", + alpha=0.5, +) +s1 = ax.scatter( + xs=X_train[:, 0], + ys=X_train[:, 1], + zs=y_train, + color="tab:blue", + alpha=0.5, +) +ax.set_xlabel("X_0") +ax.set_ylabel("X_1") + +# # Define GP model + + +# + +class ExactGPModel(gpytorch.models.ExactGP): + """API: + + model.forward() prior f_pred + model() posterior f_pred + + likelihood(model.forward()) prior with noise y_pred + likelihood(model()) posterior with noise y_pred + """ + + def __init__(self, X_train, y_train, likelihood): + super().__init__(X_train, y_train, likelihood) + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel() + ) + + def forward(self, x): + """The prior, defined in terms of the mean and covariance function.""" + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +likelihood = gpytorch.likelihoods.GaussianLikelihood() +model = ExactGPModel(X_train, y_train, likelihood) +# - + +# Inspect the model +print(model) + +# Default start hyper params +pprint(extract_model_params(model, raw=False)) + +# + +# Set new start hyper params +model.mean_module.constant = 0.0 +model.covar_module.base_kernel.lengthscale = 3.0 +model.covar_module.outputscale = 8.0 +model.likelihood.noise_covar.noise = 0.1 + +pprint(extract_model_params(model, raw=False)) +# - + +# # Fit GP to data: optimize hyper params + +# + +# Train mode +model.train() +likelihood.train() + +optimizer = torch.optim.Adam(model.parameters(), lr=0.2) +loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) + +n_iter = 300 +history = defaultdict(list) +for ii in range(n_iter): + optimizer.zero_grad() + loss = -loss_func(model(X_train), y_train) + loss.backward() + optimizer.step() + if (ii + 1) % 10 == 0: + print(f"iter {ii+1}/{n_iter}, {loss=:.3f}") + for p_name, p_val in extract_model_params(model).items(): + history[p_name].append(p_val) + history["loss"].append(loss.item()) +# - + +ncols = len(history) +fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5)) +for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") + +# Values of optimized hyper params +pprint(extract_model_params(model, raw=False)) + +# # Run prediction + +# + +model.eval() +likelihood.eval() + +with torch.no_grad(): + post_pred_f = model(X_pred) + post_pred_y = likelihood(model(X_pred)) + + fig = plt.figure() + ax = fig.add_subplot(projection="3d") + ax.plot_surface( + data_pred.XG, + data_pred.YG, + data_pred.z.reshape((data_pred.nx, data_pred.ny)), + color="tab:grey", + alpha=0.5, + ) + ax.plot_surface( + data_pred.XG, + data_pred.YG, + post_pred_y.mean.reshape((data_pred.nx, data_pred.ny)), + color="tab:red", + alpha=0.5, + ) + ax.set_xlabel("X_0") + ax.set_ylabel("X_1") + +assert (post_pred_f.mean == post_pred_y.mean).all() +# - + + +# # Plot difference to ground truth and uncertainty + +# + +ncols = 3 +fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5)) + +vmax = post_pred_y.stddev.max() +cs = [] + +cs.append( + axs[0].contourf( + data_pred.XG, + data_pred.YG, + torch.abs(post_pred_y.mean - data_pred.z).reshape( + (data_pred.nx, data_pred.ny) + ), + ) +) +axs[0].set_title("|y_pred - y_true|") + +cs.append( + axs[1].contourf( + data_pred.XG, + data_pred.YG, 
+ post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)), + vmin=0, + vmax=vmax, + ) +) +axs[1].set_title("f_std (epistemic)") + +cs.append( + axs[2].contourf( + data_pred.XG, + data_pred.YG, + post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)), + vmin=0, + vmax=vmax, + ) +) +axs[2].set_title("y_std (epistemic + aleatoric)") + +for ax, c in zip(axs, cs): + ax.set_xlabel("X_0") + ax.set_ylabel("X_1") + ax.scatter(x=X_train[:, 0], y=X_train[:, 1], color="white", alpha=0.2) + fig.colorbar(c, ax=ax) +# - + +# # Check learned noise + +# + +print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt()) +print( + np.sqrt( + extract_model_params(model, raw=False)["likelihood.noise_covar.noise"] + ) +) +print(noise_std) +# - + +# # Plot confidence bands + +# + +y_mean = post_pred_y.mean.reshape((data_pred.nx, data_pred.ny)) +y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)) +upper = y_mean + 2 * y_std +lower = y_mean - 2 * y_std + +fig, ax = fig_ax_3d() +for Z, color in [(upper, "tab:green"), (lower, "tab:red")]: + ax.plot_surface( + data_pred.XG, + data_pred.YG, + Z, + color=color, + alpha=0.5, + ) + +contour_z = lower.min() - 1 +zlim = ax.get_xlim() +ax.set_zlim((contour_z, zlim[1] + abs(contour_z))) +ax.contourf(data_pred.XG, data_pred.YG, y_std, zdir="z", offset=contour_z) +# - + +# + +# When running as script +if not is_interactive(): + plt.show() +# - diff --git a/BLcourse2.3/README.md b/BLcourse2.3/README.md new file mode 100644 index 0000000000000000000000000000000000000000..21766d2c861fc568cce33e935241b908c3ce8938 --- /dev/null +++ b/BLcourse2.3/README.md @@ -0,0 +1,11 @@ +Based on https://docs.gpytorch.ai/en/stable/examples/01_Exact_GPs/Simple_GP_Regression.html + +Use https://jupytext.readthedocs.io to create a notebook. + +```sh +$ jupytext --to ipynb notebook.py +$ jupyter-notebook notebook.ipynb +``` + +For convenience the ipynb file is included. Please keep it in sync with the py +file using jupytext. diff --git a/BLcourse2.3/convert-to-ipynb.sh b/BLcourse2.3/convert-to-ipynb.sh new file mode 100755 index 0000000000000000000000000000000000000000..be23e43b6e01e4ad115436765c007a8a12c78684 --- /dev/null +++ b/BLcourse2.3/convert-to-ipynb.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +for fn in 0*dim.py; do + jupytext --to ipynb $fn +done diff --git a/BLcourse2.3/utils.py b/BLcourse2.3/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fc512d45c86aeb380087bd297aed0b3b29d84d7b --- /dev/null +++ b/BLcourse2.3/utils.py @@ -0,0 +1,45 @@ +from matplotlib import pyplot as plt + + +def extract_model_params(model, raw=False) -> dict: + """Helper to convert model.named_parameters() to dict. + + With raw=True, use + foo.bar.raw_param + else + foo.bar.param + + See https://docs.gpytorch.ai/en/stable/examples/00_Basic_Usage/Hyperparameters.html#Raw-vs-Actual-Parameters + """ + if raw: + return dict( + (p_name, p_val.item()) + for p_name, p_val in model.named_parameters() + ) + else: + out = dict() + # p_name = 'covar_module.base_kernel.raw_lengthscale'. Access + # model.covar_module.base_kernel.lengthscale (w/o the raw_) + for p_name, p_val in model.named_parameters(): + # Yes, eval() hack. Sorry. 
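+            # Drop the "raw_" prefix so that evaluating the dotted attribute
+            # path on the model below returns the transformed (actual) value,
+            # e.g. model.covar_module.base_kernel.lengthscale instead of
+            # model.covar_module.base_kernel.raw_lengthscale.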
+ p_name = p_name.replace(".raw_", ".") + p_val = eval(f"model.{p_name}") + out[p_name] = p_val.item() + return out + + +def fig_ax_3d(): + fig = plt.figure() + ax = fig.add_subplot(projection="3d") + return fig, ax + + +def plot_samples(ax, X_pred, samples, label=None, **kwds): + plot_kwds = dict(color="tab:green", alpha=0.3) + plot_kwds.update(kwds) + + if label is None: + ax.plot(X_pred, samples.T, **plot_kwds) + else: + ax.plot(X_pred, samples[0, :], **plot_kwds, label=label) + ax.plot(X_pred, samples[1:, :].T, **plot_kwds, label="_") diff --git a/slides/BLcourse2.3.pdf b/slides/BLcourse2.3.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9e1b7f8cb80b4521c9b003cfec3fd8288ac81d5f Binary files /dev/null and b/slides/BLcourse2.3.pdf differ