diff --git a/BLcourse2.3/01_one_dim.ipynb b/BLcourse2.3/01_one_dim.ipynb index f5208de38e0c4b0cc1a1ed052d6d9393a318aa04..7c993e8f48af0f824f9bc92a6af1fde90023830b 100644 --- a/BLcourse2.3/01_one_dim.ipynb +++ b/BLcourse2.3/01_one_dim.ipynb @@ -3,16 +3,19 @@ { "cell_type": "markdown", "id": "2a8ed8ae", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "# Notation\n", "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n", "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n", "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n", - "$\\newcommand{\\cov}{\\mathrm{cov}}$\n", "$\\newcommand{\\test}[1]{#1_*}$\n", "$\\newcommand{\\testtest}[1]{#1_{**}}$\n", + "$\\DeclareMathOperator{\\diag}{diag}$\n", + "$\\DeclareMathOperator{\\cov}{cov}$\n", "\n", "Vector $\\ve a\\in\\mathbb R^n$ or $\\mathbb R^{n\\times 1}$, so \"column\" vector.\n", "Matrix $\\ma A\\in\\mathbb R^{n\\times m}$. Design matrix with input vectors $\\ve\n", @@ -24,6 +27,19 @@ "$\\ma X$ to keep the notation consistent with the slides." ] }, + { + "cell_type": "markdown", + "id": "8a7cc3c8", + "metadata": {}, + "source": [ + "# About\n", + "\n", + "In this notebook, we explore the material presented in [the\n", + "slides](https://doi.org/10.6084/m9.figshare.25988176) with code, using the\n", + "[gpytorch](https://gpytorch.ai) library. We cover exact GP inference and\n", + "hyper parameter optimization using the marginal likelihood." + ] + }, { "cell_type": "markdown", "id": "ae440984", @@ -40,8 +56,8 @@ "outputs": [], "source": [ "##%matplotlib notebook\n", - "##%matplotlib widget\n", - "%matplotlib inline" + "%matplotlib widget\n", + "##%matplotlib inline" ] }, { @@ -59,6 +75,7 @@ "import gpytorch\n", "from matplotlib import pyplot as plt\n", "from matplotlib import is_interactive\n", + "import numpy as np\n", "\n", "from utils import extract_model_params, plot_samples\n", "\n", @@ -103,26 +120,41 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_data(x, gaps=[[1, 3]], const=5):\n", - " y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const\n", + "def ground_truth(x, const):\n", + " return torch.sin(x) * torch.exp(-0.2 * x) + const\n", + "\n", + "\n", + "def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):\n", + " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", + " y = ground_truth(x, const=const) + noise_dist.sample(\n", + " sample_shape=(len(x),)\n", + " )\n", " msk = torch.tensor([True] * len(x))\n", " if gaps is not None:\n", " for g in gaps:\n", " msk = msk & ~((x > g[0]) & (x < g[1]))\n", - " return x[msk], y[msk]\n", + " return x[msk], y[msk], y\n", "\n", "\n", + "const = 5.0\n", + "noise_std = 0.1\n", "x = torch.linspace(0, 4 * math.pi, 100)\n", - "X_train, y_train = generate_data(x, gaps=[[6, 10]])\n", + "X_train, y_train, y_gt_train = generate_data(\n", + " x, gaps=[[6, 10]], const=const, noise_std=noise_std\n", + ")\n", "X_pred = torch.linspace(\n", " X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n", ")\n", + "y_gt_pred = ground_truth(X_pred, const=const)\n", "\n", "print(f\"{X_train.shape=}\")\n", "print(f\"{y_train.shape=}\")\n", "print(f\"{X_pred.shape=}\")\n", "\n", - "plt.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\")" + "fig, ax = plt.subplots()\n", + "ax.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\", label=\"noisy data\")\n", + "ax.plot(X_pred, y_gt_pred, ls=\"--\", color=\"k\", label=\"ground truth\")\n", + "ax.legend()" ] }, { @@ -145,7 +177,7 @@ 
"`ScaleKernel`.\n", "\n", "In addition, we define a constant mean via `ConstantMean`. Finally we have\n", - "the likelihood noise $\\sigma_n^2$. So in total we have 4 hyper params.\n", + "the likelihood variance $\\sigma_n^2$. So in total we have 4 hyper params.\n", "\n", "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n", "* $\\sigma_n^2$ = `model.likelihood.noise_covar.noise`\n", @@ -209,7 +241,7 @@ "outputs": [], "source": [ "# Default start hyper params\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { @@ -225,9 +257,9 @@ "model.mean_module.constant = 3.0\n", "model.covar_module.base_kernel.lengthscale = 1.0\n", "model.covar_module.outputscale = 1.0\n", - "model.likelihood.noise_covar.noise = 0.1\n", + "model.likelihood.noise_covar.noise = 1e-3\n", "\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { @@ -237,11 +269,16 @@ "source": [ "# Sample from the GP prior\n", "\n", - "We sample a number of functions $f_j, j=1,\\ldots,M$ from the GP prior and\n", + "We sample a number of functions $f_m, m=1,\\ldots,M$ from the GP prior and\n", "evaluate them at all $\\ma X$ = `X_pred` points, of which we have $N=200$. So\n", - "we effectively generate samples from $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n", - "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ and the\n", - "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$." + "we effectively generate samples from `pri_f` = $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n", + "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ represents a\n", + "sampled *function* $f$ evaluated the $N=200$ points in $\\ma X$. The\n", + "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$. Its diagonal\n", + "$\\diag\\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis.\n", + "This is what we plot as \"confidence band\" `f_mean` $\\pm$ `2 * f_std`.\n", + "The off-diagonal elements represent the correlation between different points\n", + "$K_{ij} = \\cov[f(\\ve x_i), f(\\ve x_j)]$." ] }, { @@ -286,11 +323,12 @@ "id": "0598b567", "metadata": {}, "source": [ - "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n", - "\\ve c$ does *not* mean that each sampled vector $\\predve f$'s mean is\n", - "equal to $c$. Instead, we have that at each $\\ve x_i$, the mean of\n", - "*all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M f_j(\\ve x_i)\n", - "\\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" + "Let's investigate the samples more closely. First we note that the samples\n", + "fluctuate around the mean `model.mean_module.constant` we defined above. A\n", + "constant mean $\\ve m(\\ma X) = \\ve c$ does *not* mean that each sampled vector\n", + "$\\predve f$'s mean is equal to $c$. Instead, we have that at each $\\ve x_i$,\n", + "the mean of *all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M\n", + "f_m(\\ve x_i) \\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" ] }, { @@ -330,21 +368,20 @@ "# GP posterior predictive distribution with fixed hyper params\n", "\n", "Now we calculate the posterior predictive distribution $p(\\test{\\predve\n", - "f}|\\test{\\ma X}, \\ma X, \\ve y)$, i.e. we condition on the train data (Bayesian\n", + "f}|\\test{\\ma X}, \\ma X, \\ve y) = \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$,\n", + "i.e. 
we condition on the train data (Bayesian\n", "inference).\n", "\n", "We use the fixed hyper param values defined above. In particular, since\n", "$\\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a\n", - "regression setting." + "regression setting -- the GP's mean doesn't interpolate all points." ] }, { "cell_type": "code", "execution_count": null, "id": "d01951f1", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "# Evaluation (predictive posterior) mode\n", @@ -375,6 +412,14 @@ " color=\"tab:red\",\n", " lw=2,\n", " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " y_gt_pred.numpy(),\n", + " label=\"ground truth\",\n", + " color=\"k\",\n", + " lw=2,\n", + " ls=\"--\",\n", + " )\n", " ax.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", @@ -395,20 +440,30 @@ "cell_type": "markdown", "id": "b960a745", "metadata": {}, + "source": [ + "We observe that all sampled functions (green) and the mean (red) tend towards\n", + "the low fixed mean function $m(\\ve x)=c$ at 3.0 in the absence of data, while\n", + "the actual data mean is `const` from above (data generation). Also the other\n", + "hyper params ($\\ell$, $\\sigma_n^2$, $s$) are just guesses. Now we will\n", + "calculate their actual value by minimizing the negative log marginal\n", + "likelihood." + ] + }, + { + "cell_type": "markdown", + "id": "2de7f663", + "metadata": {}, "source": [ "# Fit GP to data: optimize hyper params\n", "\n", "In each step of the optimizer, we condition on the training data (e.g. do\n", "Bayesian inference) to calculate the posterior predictive distribution for\n", - "the current values of the hyper params. We iterate until the log marginal\n", + "the current values of the hyper params. We iterate until the negative log marginal\n", "likelihood is converged.\n", "\n", "We use a simplistic PyTorch-style hand written train loop without convergence\n", "control, so make sure to use enough `n_iter` and eyeball-check that the loss\n", - "is converged :-)\n", - "\n", - "Observe how all hyper params converge. In particular, note that the constant\n", - "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." + "is converged :-)" ] }, { @@ -433,8 +488,8 @@ " loss.backward()\n", " optimizer.step()\n", " if (ii + 1) % 10 == 0:\n", - " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", - " for p_name, p_val in extract_model_params(model).items():\n", + " print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n", + " for p_name, p_val in extract_model_params(model, try_item=True).items():\n", " history[p_name].append(p_val)\n", " history[\"loss\"].append(loss.item())" ] @@ -446,13 +501,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Plot hyper params and loss (neg. 
log marginal likelihood) convergence\n", + "# Plot hyper params and loss (negative log marginal likelihood) convergence\n", "ncols = len(history)\n", - "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", - "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", - " ax.plot(p_lst)\n", - " ax.set_title(p_name)\n", - " ax.set_xlabel(\"iterations\")" + "fig, axs = plt.subplots(\n", + " ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n", + ")\n", + "with torch.no_grad():\n", + " for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" ] }, { @@ -463,27 +521,33 @@ "outputs": [], "source": [ "# Values of optimized hyper params\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { "cell_type": "markdown", "id": "98aefb90", "metadata": {}, + "source": [ + "We see that all hyper params converge. In particular, note that the constant\n", + "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." + ] + }, + { + "cell_type": "markdown", + "id": "b2f6afb0", + "metadata": {}, "source": [ "# Run prediction\n", "\n", - "We show \"noiseless\" (left: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma)}$) vs.\n", - "\"noisy\" (right: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma + \\sigma_n^2\\,\\ma\n", - "I_N)}$) predictions with\n", - "\n", - "$$\\ma\\Sigma = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$$\n", + "We run prediction with two variants of the posterior predictive distribution:\n", + "using either only the epistemic uncertainty or using the total uncertainty.\n", "\n", - "We find that $\\ma\\Sigma$ reflects behavior we would like to see from\n", - "epistemic uncertainty -- it is high when we have no data\n", - "(out-of-distribution). But this alone isn't the whole story. We need to add\n", - "the estimated noise level $\\sigma_n^2$ in order for the confidence band to\n", - "cover the data." 
+ "* epistemic: $p(\\test{\\predve f}|\\test{\\ma X}, \\ma X, \\ve y) =\n", + " \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$ = `post_pred_f` with\n", + " $\\test{\\ma\\Sigma} = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$\n", + "* total: $p(\\test{\\predve y}|\\test{\\ma X}, \\ma X, \\ve y) =\n", + " \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N))$ = `post_pred_y`" ] }, { @@ -502,10 +566,15 @@ " post_pred_f = model(X_pred)\n", " post_pred_y = likelihood(model(X_pred))\n", "\n", - " fig, axs = plt.subplots(ncols=2, figsize=(12, 5))\n", + " fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)\n", " fig_sigmas, ax_sigmas = plt.subplots()\n", - " for ii, (ax, post_pred, name) in enumerate(\n", - " zip(axs, [post_pred_f, post_pred_y], [\"f\", \"y\"])\n", + " for ii, (ax, post_pred, name, title) in enumerate(\n", + " zip(\n", + " axs,\n", + " [post_pred_f, post_pred_y],\n", + " [\"f\", \"y\"],\n", + " [\"epistemic uncertainty\", \"total uncertainty\"],\n", + " )\n", " ):\n", " yf_mean = post_pred.mean\n", " yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n", @@ -527,6 +596,14 @@ " color=\"tab:red\",\n", " lw=2,\n", " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " y_gt_pred.numpy(),\n", + " label=\"ground truth\",\n", + " color=\"k\",\n", + " lw=2,\n", + " ls=\"--\",\n", + " )\n", " ax.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", @@ -535,19 +612,20 @@ " color=\"tab:orange\",\n", " alpha=0.3,\n", " )\n", + " ax.set_title(f\"confidence = {title}\")\n", " if name == \"f\":\n", - " sigma_label = r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma)}$\"\n", + " sigma_label = r\"epistemic: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_*)}$\"\n", " zorder = 1\n", " else:\n", " sigma_label = (\n", - " r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma + \\sigma_n^2\\,I)}$\"\n", + " r\"total: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_* + \\sigma_n^2\\,I)}$\"\n", " )\n", " zorder = 0\n", " ax_sigmas.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", " upper.numpy(),\n", - " label=\"confidence \" + sigma_label,\n", + " label=sigma_label,\n", " color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n", " alpha=0.5,\n", " zorder=zorder,\n", @@ -559,9 +637,62 @@ " plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. samples\")\n", " if ii == 1:\n", " ax.legend()\n", + " ax_sigmas.set_title(\"total vs. epistemic uncertainty\")\n", " ax_sigmas.legend()" ] }, + { + "cell_type": "markdown", + "id": "4c52a919", + "metadata": {}, + "source": [ + "We find that $\\test{\\ma\\Sigma}$ reflects behavior we would like to see from\n", + "epistemic uncertainty -- it is high when we have no data\n", + "(out-of-distribution). But this alone isn't the whole story. We need to add\n", + "the estimated likelihood variance $\\sigma_n^2$ in order for the confidence\n", + "band to cover the data -- this turns the estimated total uncertainty into a\n", + "*calibrated* uncertainty." + ] + }, + { + "cell_type": "markdown", + "id": "8823deb0", + "metadata": {}, + "source": [ + "# Let's check the learned noise\n", + "\n", + "We compare the target data noise (`noise_std`) to the learned GP noise, in\n", + "the form of the likelihood *standard deviation* $\\sigma_n$. The latter is\n", + "equal to the $\\sqrt{\\cdot}$ of `likelihood.noise_covar.noise` and can also be\n", + "calculated via $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N) -\n", + "\\diag(\\test{\\ma\\Sigma}})$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7cd1c02", + "metadata": {}, + "outputs": [], + "source": [ + "# Target noise to learn\n", + "print(\"data noise:\", noise_std)\n", + "\n", + "# The two below must be the same\n", + "print(\n", + " \"learned noise:\",\n", + " (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n", + ")\n", + "print(\n", + " \"learned noise:\",\n", + " np.sqrt(\n", + " extract_model_params(model, try_item=True)[\n", + " \"likelihood.noise_covar.noise\"\n", + " ]\n", + " ),\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -580,9 +711,21 @@ "formats": "ipynb,py:light" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "bayes-ml-course", "language": "python", - "name": "python3" + "name": "bayes-ml-course" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" } }, "nbformat": 4, diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py index e9e55f233fc306eb2da38ca98ce710d5a359daa0..f29e5bbd6556259f674c9c8fcf7b3935461ddc94 100644 --- a/BLcourse2.3/01_one_dim.py +++ b/BLcourse2.3/01_one_dim.py @@ -6,11 +6,11 @@ # extension: .py # format_name: light # format_version: '1.5' -# jupytext_version: 1.16.2 +# jupytext_version: 1.17.1 # kernelspec: -# display_name: Python 3 (ipykernel) +# display_name: bayes-ml-course # language: python -# name: python3 +# name: bayes-ml-course # --- # # Notation @@ -18,9 +18,10 @@ # $\newcommand{\ma}[1]{\mathbf{#1}}$ # $\newcommand{\pred}[1]{\rm{#1}}$ # $\newcommand{\predve}[1]{\mathbf{#1}}$ -# $\newcommand{\cov}{\mathrm{cov}}$ # $\newcommand{\test}[1]{#1_*}$ # $\newcommand{\testtest}[1]{#1_{**}}$ +# $\DeclareMathOperator{\diag}{diag}$ +# $\DeclareMathOperator{\cov}{cov}$ # # Vector $\ve a\in\mathbb R^n$ or $\mathbb R^{n\times 1}$, so "column" vector. # Matrix $\ma A\in\mathbb R^{n\times m}$. Design matrix with input vectors $\ve @@ -31,11 +32,19 @@ # we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with # $\ma X$ to keep the notation consistent with the slides. + +# # About +# +# In this notebook, we explore the material presented in [the +# slides](https://doi.org/10.6084/m9.figshare.25988176) with code, using the +# [gpytorch](https://gpytorch.ai) library. We cover exact GP inference and +# hyper parameter optimization using the marginal likelihood. 
+ # # Imports, helpers, setup # ##%matplotlib notebook -# ##%matplotlib widget -# %matplotlib inline +# %matplotlib widget +# ##%matplotlib inline # + import math @@ -46,6 +55,7 @@ import torch import gpytorch from matplotlib import pyplot as plt from matplotlib import is_interactive +import numpy as np from utils import extract_model_params, plot_samples @@ -77,26 +87,41 @@ torch.manual_seed(123) # + -def generate_data(x, gaps=[[1, 3]], const=5): - y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const +def ground_truth(x, const): + return torch.sin(x) * torch.exp(-0.2 * x) + const + + +def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None): + noise_dist = torch.distributions.Normal(loc=0, scale=noise_std) + y = ground_truth(x, const=const) + noise_dist.sample( + sample_shape=(len(x),) + ) msk = torch.tensor([True] * len(x)) if gaps is not None: for g in gaps: msk = msk & ~((x > g[0]) & (x < g[1])) - return x[msk], y[msk] + return x[msk], y[msk], y +const = 5.0 +noise_std = 0.1 x = torch.linspace(0, 4 * math.pi, 100) -X_train, y_train = generate_data(x, gaps=[[6, 10]]) +X_train, y_train, y_gt_train = generate_data( + x, gaps=[[6, 10]], const=const, noise_std=noise_std +) X_pred = torch.linspace( X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False ) +y_gt_pred = ground_truth(X_pred, const=const) print(f"{X_train.shape=}") print(f"{y_train.shape=}") print(f"{X_pred.shape=}") -plt.scatter(X_train, y_train, marker="o", color="tab:blue") +fig, ax = plt.subplots() +ax.scatter(X_train, y_train, marker="o", color="tab:blue", label="noisy data") +ax.plot(X_pred, y_gt_pred, ls="--", color="k", label="ground truth") +ax.legend() # - # # Define GP model @@ -112,7 +137,7 @@ plt.scatter(X_train, y_train, marker="o", color="tab:blue") # `ScaleKernel`. # # In addition, we define a constant mean via `ConstantMean`. Finally we have -# the likelihood noise $\sigma_n^2$. So in total we have 4 hyper params. +# the likelihood variance $\sigma_n^2$. So in total we have 4 hyper params. # # * $\ell$ = `model.covar_module.base_kernel.lengthscale` # * $\sigma_n^2$ = `model.likelihood.noise_covar.noise` @@ -154,26 +179,31 @@ model = ExactGPModel(X_train, y_train, likelihood) print(model) # Default start hyper params -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) # + # Set new start hyper params model.mean_module.constant = 3.0 model.covar_module.base_kernel.lengthscale = 1.0 model.covar_module.outputscale = 1.0 -model.likelihood.noise_covar.noise = 0.1 +model.likelihood.noise_covar.noise = 1e-3 -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) # - # # Sample from the GP prior # -# We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and +# We sample a number of functions $f_m, m=1,\ldots,M$ from the GP prior and # evaluate them at all $\ma X$ = `X_pred` points, of which we have $N=200$. So -# we effectively generate samples from $p(\predve f|\ma X) = \mathcal N(\ve -# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ and the -# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. +# we effectively generate samples from `pri_f` = $p(\predve f|\ma X) = \mathcal N(\ve +# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ represents a +# sampled *function* $f$ evaluated the $N=200$ points in $\ma X$. The +# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. Its diagonal +# $\diag\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis. 
+# This is what we plot as "confidence band" `f_mean` $\pm$ `2 * f_std`. +# The off-diagonal elements represent the correlation between different points +# $K_{ij} = \cov[f(\ve x_i), f(\ve x_j)]$. # + model.eval() @@ -205,11 +235,12 @@ with torch.no_grad(): # - -# Let's investigate the samples more closely. A constant mean $\ve m(\ma X) = -# \ve c$ does *not* mean that each sampled vector $\predve f$'s mean is -# equal to $c$. Instead, we have that at each $\ve x_i$, the mean of -# *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M f_j(\ve x_i) -# \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. +# Let's investigate the samples more closely. First we note that the samples +# fluctuate around the mean `model.mean_module.constant` we defined above. A +# constant mean $\ve m(\ma X) = \ve c$ does *not* mean that each sampled vector +# $\predve f$'s mean is equal to $c$. Instead, we have that at each $\ve x_i$, +# the mean of *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M +# f_m(\ve x_i) \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. # # Look at the first 20 x points from M=10 samples @@ -228,12 +259,13 @@ print(f"{f_samples.mean(axis=0).std()=}") # # GP posterior predictive distribution with fixed hyper params # # Now we calculate the posterior predictive distribution $p(\test{\predve -# f}|\test{\ma X}, \ma X, \ve y)$, i.e. we condition on the train data (Bayesian +# f}|\test{\ma X}, \ma X, \ve y) = \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$, +# i.e. we condition on the train data (Bayesian # inference). # # We use the fixed hyper param values defined above. In particular, since # $\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a -# regression setting. +# regression setting -- the GP's mean doesn't interpolate all points. # + # Evaluation (predictive posterior) mode @@ -264,6 +296,14 @@ with torch.no_grad(): color="tab:red", lw=2, ) + ax.plot( + X_pred.numpy(), + y_gt_pred.numpy(), + label="ground truth", + color="k", + lw=2, + ls="--", + ) ax.fill_between( X_pred.numpy(), lower.numpy(), @@ -280,20 +320,23 @@ with torch.no_grad(): ax.legend() # - +# We observe that all sampled functions (green) and the mean (red) tend towards +# the low fixed mean function $m(\ve x)=c$ at 3.0 in the absence of data, while +# the actual data mean is `const` from above (data generation). Also the other +# hyper params ($\ell$, $\sigma_n^2$, $s$) are just guesses. Now we will +# calculate their actual value by minimizing the negative log marginal +# likelihood. # # Fit GP to data: optimize hyper params # # In each step of the optimizer, we condition on the training data (e.g. do # Bayesian inference) to calculate the posterior predictive distribution for -# the current values of the hyper params. We iterate until the log marginal +# the current values of the hyper params. We iterate until the negative log marginal # likelihood is converged. # # We use a simplistic PyTorch-style hand written train loop without convergence # control, so make sure to use enough `n_iter` and eyeball-check that the loss # is converged :-) -# -# Observe how all hyper params converge. In particular, note that the constant -# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`. 
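+#
+# If you want a simple automatic stopping criterion instead, a minimal sketch
+# (assuming a hypothetical tolerance `tol`, e.g. `tol = 1e-4`) could compare
+# the last two recorded loss values at the end of each iteration of the loop
+# below and stop once they are close enough:
+#
+# ```
+# losses = history["loss"]
+# if len(losses) > 1 and abs(losses[-1] - losses[-2]) < tol:
+#     break
+# ```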
# + # Train mode @@ -311,36 +354,39 @@ for ii in range(n_iter): loss.backward() optimizer.step() if (ii + 1) % 10 == 0: - print(f"iter {ii+1}/{n_iter}, {loss=:.3f}") - for p_name, p_val in extract_model_params(model).items(): + print(f"iter {ii + 1}/{n_iter}, {loss=:.3f}") + for p_name, p_val in extract_model_params(model, try_item=True).items(): history[p_name].append(p_val) history["loss"].append(loss.item()) # - -# Plot hyper params and loss (neg. log marginal likelihood) convergence +# Plot hyper params and loss (negative log marginal likelihood) convergence ncols = len(history) -fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5)) -for ax, (p_name, p_lst) in zip(axs, history.items()): - ax.plot(p_lst) - ax.set_title(p_name) - ax.set_xlabel("iterations") +fig, axs = plt.subplots( + ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed" +) +with torch.no_grad(): + for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") # Values of optimized hyper params -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) + +# We see that all hyper params converge. In particular, note that the constant +# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`. # # Run prediction # -# We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs. -# "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma -# I_N)}$) predictions with -# -# $$\ma\Sigma = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$$ +# We run prediction with two variants of the posterior predictive distribution: +# using either only the epistemic uncertainty or using the total uncertainty. # -# We find that $\ma\Sigma$ reflects behavior we would like to see from -# epistemic uncertainty -- it is high when we have no data -# (out-of-distribution). But this alone isn't the whole story. We need to add -# the estimated noise level $\sigma_n^2$ in order for the confidence band to -# cover the data. 
+# * epistemic: $p(\test{\predve f}|\test{\ma X}, \ma X, \ve y) = +# \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$ = `post_pred_f` with +# $\test{\ma\Sigma} = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$ +# * total: $p(\test{\predve y}|\test{\ma X}, \ma X, \ve y) = +# \mathcal N(\test{\ve\mu}, \test{\ma\Sigma} + \sigma_n^2\,\ma I_N))$ = `post_pred_y` # + # Evaluation (predictive posterior) mode @@ -352,10 +398,15 @@ with torch.no_grad(): post_pred_f = model(X_pred) post_pred_y = likelihood(model(X_pred)) - fig, axs = plt.subplots(ncols=2, figsize=(12, 5)) + fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True) fig_sigmas, ax_sigmas = plt.subplots() - for ii, (ax, post_pred, name) in enumerate( - zip(axs, [post_pred_f, post_pred_y], ["f", "y"]) + for ii, (ax, post_pred, name, title) in enumerate( + zip( + axs, + [post_pred_f, post_pred_y], + ["f", "y"], + ["epistemic uncertainty", "total uncertainty"], + ) ): yf_mean = post_pred.mean yf_samples = post_pred.sample(sample_shape=torch.Size((M,))) @@ -377,6 +428,14 @@ with torch.no_grad(): color="tab:red", lw=2, ) + ax.plot( + X_pred.numpy(), + y_gt_pred.numpy(), + label="ground truth", + color="k", + lw=2, + ls="--", + ) ax.fill_between( X_pred.numpy(), lower.numpy(), @@ -385,19 +444,20 @@ with torch.no_grad(): color="tab:orange", alpha=0.3, ) + ax.set_title(f"confidence = {title}") if name == "f": - sigma_label = r"$\pm 2\sqrt{\mathrm{diag}(\Sigma)}$" + sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$" zorder = 1 else: sigma_label = ( - r"$\pm 2\sqrt{\mathrm{diag}(\Sigma + \sigma_n^2\,I)}$" + r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$" ) zorder = 0 ax_sigmas.fill_between( X_pred.numpy(), lower.numpy(), upper.numpy(), - label="confidence " + sigma_label, + label=sigma_label, color="tab:orange" if name == "f" else "tab:blue", alpha=0.5, zorder=zorder, @@ -409,11 +469,44 @@ with torch.no_grad(): plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples") if ii == 1: ax.legend() + ax_sigmas.set_title("total vs. epistemic uncertainty") ax_sigmas.legend() # - +# We find that $\test{\ma\Sigma}$ reflects behavior we would like to see from +# epistemic uncertainty -- it is high when we have no data +# (out-of-distribution). But this alone isn't the whole story. We need to add +# the estimated likelihood variance $\sigma_n^2$ in order for the confidence +# band to cover the data -- this turns the estimated total uncertainty into a +# *calibrated* uncertainty. + +# # Let's check the learned noise +# +# We compare the target data noise (`noise_std`) to the learned GP noise, in +# the form of the likelihood *standard deviation* $\sigma_n$. The latter is +# equal to the $\sqrt{\cdot}$ of `likelihood.noise_covar.noise` and can also be +# calculated via $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N) - +# \diag(\test{\ma\Sigma}})$. 
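+#
+# The last expression recovers $\sigma_n$ because the two diagonals differ by
+# exactly $\sigma_n^2$ at every test point,
+#
+# $$\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N) = \diag(\test{\ma\Sigma}) + \sigma_n^2\,\ve 1\:,$$
+#
+# so taking the mean over test points and then the $\sqrt{\cdot}$, as done in
+# the code below, gives back $\sigma_n$.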
+ # + +# Target noise to learn +print("data noise:", noise_std) + +# The two below must be the same +print( + "learned noise:", + (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(), +) +print( + "learned noise:", + np.sqrt( + extract_model_params(model, try_item=True)[ + "likelihood.noise_covar.noise" + ] + ), +) +# - + # When running as script if not is_interactive(): plt.show() -# - diff --git a/BLcourse2.3/02_two_dim.ipynb b/BLcourse2.3/02_two_dim.ipynb index a87daa957597ef0730b108251a6f5c459c7c8eec..9ae348f938d2f337bd74ce53ed79520695bb6fc7 100644 --- a/BLcourse2.3/02_two_dim.ipynb +++ b/BLcourse2.3/02_two_dim.ipynb @@ -1,5 +1,27 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "8c19b309", + "metadata": {}, + "source": [ + "# About\n", + "\n", + "In this notebook, we use a GP to fit a 2D data set. We use the same ExactGP\n", + "machinery as in the 1D case and show how GPs can be used for 2D interpolation\n", + "(when data is free of noise) or regression (noisy data). Think of this as a\n", + "toy geospatial data setting. Actually, in geostatistics, Gaussian process\n", + "regression is known as [Kriging](https://en.wikipedia.org/wiki/Kriging).\n", + "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n", + "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n", + "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\test}[1]{#1_*}$\n", + "$\\newcommand{\\testtest}[1]{#1_{**}}$\n", + "$\\DeclareMathOperator{\\diag}{diag}$\n", + "$\\DeclareMathOperator{\\cov}{cov}$" + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,8 +29,8 @@ "metadata": {}, "outputs": [], "source": [ - "%matplotlib notebook\n", - "##%matplotlib widget\n", + "##%matplotlib notebook\n", + "%matplotlib widget\n", "##%matplotlib inline" ] }, @@ -53,7 +75,14 @@ "lines_to_next_cell": 2 }, "source": [ - "# Generate toy 2D data" + "# Generate toy 2D data\n", + "\n", + "Our ground truth function is $f(\\ve x) = \\sin(r) / r$ with $\\ve x =\n", + "[x_0,x_1] \\in\\mathbb R^2$ and the radial distance\n", + "$r=\\sqrt{\\ve x^\\top\\,\\ve x}$, also known as \"Mexican hat\" function, which is a\n", + "radial wave-like pattern which decays with distance from the center $\\ve\n", + "x=\\ve 0$. We generate data by random sampling 2D points $\\ve x_i$ and calculating\n", + "$y_i = f(\\ve x_i)$, optionally adding Gaussian noise further down." ] }, { @@ -146,6 +175,14 @@ "# Exercise 1" ] }, + { + "cell_type": "markdown", + "id": "c860aa4f", + "metadata": {}, + "source": [ + "Keep the settings below and explore the notebook till the end first." + ] + }, { "cell_type": "code", "execution_count": null, @@ -165,15 +202,23 @@ "# Exercise 2" ] }, + { + "cell_type": "markdown", + "id": "9f0c3b31", + "metadata": {}, + "source": [ + "First complete the notebook as is, then come back here." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6bec0f58", + "id": "5132eaeb", "metadata": {}, "outputs": [], "source": [ - "##use_noise = True\n", - "##use_gap = False" + "##use_noise = False\n", + "##use_gap = True" ] }, { @@ -184,21 +229,29 @@ "# Exercise 3" ] }, + { + "cell_type": "markdown", + "id": "bfd5d87b", + "metadata": {}, + "source": [ + "First complete the notebook with Exercise 2, then come back here." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "5132eaeb", + "id": "f27dce89", "metadata": {}, "outputs": [], "source": [ - "##use_noise = False\n", + "##use_noise = True\n", "##use_gap = True" ] }, { "cell_type": "code", "execution_count": null, - "id": "6efe09b3", + "id": "cd8ac983", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +259,9 @@ " # noisy train data\n", " noise_std = 0.2\n", " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", - " y_train = data_train.z + noise_dist.sample_n(len(data_train.z))\n", + " y_train = data_train.z + noise_dist.sample(\n", + " sample_shape=(len(data_train.z),)\n", + " )\n", "else:\n", " # noise-free train data\n", " noise_std = 0\n", @@ -216,11 +271,12 @@ { "cell_type": "code", "execution_count": null, - "id": "af637911", + "id": "59f6f6d6", "metadata": {}, "outputs": [], "source": [ - "# Cut out part of the train data to create out-of-distribution predictions\n", + "# Cut out part of the train data to create out-of-distribution predictions.\n", + "# Same as the \"gaps\" we created in the 1D case.\n", "\n", "if use_gap:\n", " mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)\n", @@ -254,6 +310,15 @@ "ax.set_ylabel(\"X_1\")" ] }, + { + "cell_type": "markdown", + "id": "b7ff9e6e", + "metadata": {}, + "source": [ + "The gray surface is the ground truth function. The blue points are the\n", + "training data." + ] + }, { "cell_type": "markdown", "id": "a00bf4e4", @@ -318,7 +383,7 @@ "outputs": [], "source": [ "# Default start hyper params\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { @@ -334,7 +399,7 @@ "model.covar_module.outputscale = 8.0\n", "model.likelihood.noise_covar.noise = 0.1\n", "\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { @@ -356,10 +421,10 @@ "model.train()\n", "likelihood.train()\n", "\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=0.2)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.15)\n", "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n", "\n", - "n_iter = 300\n", + "n_iter = 400\n", "history = defaultdict(list)\n", "for ii in range(n_iter):\n", " optimizer.zero_grad()\n", @@ -367,8 +432,8 @@ " loss.backward()\n", " optimizer.step()\n", " if (ii + 1) % 10 == 0:\n", - " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", - " for p_name, p_val in extract_model_params(model).items():\n", + " print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n", + " for p_name, p_val in extract_model_params(model, try_item=True).items():\n", " history[p_name].append(p_val)\n", " history[\"loss\"].append(loss.item())" ] @@ -381,11 +446,14 @@ "outputs": [], "source": [ "ncols = len(history)\n", - "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", - "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", - " ax.plot(p_lst)\n", - " ax.set_title(p_name)\n", - " ax.set_xlabel(\"iterations\")" + "fig, axs = plt.subplots(\n", + " ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n", + ")\n", + "with torch.no_grad():\n", + " for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" ] }, { @@ -396,7 +464,7 @@ "outputs": [], "source": [ "# Values of optimized hyper params\n", - "pprint(extract_model_params(model, raw=False))" + "pprint(extract_model_params(model))" ] }, { @@ -411,9 +479,7 @@ "cell_type": "code", "execution_count": null, "id": 
"59e18623", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "model.eval()\n", @@ -445,12 +511,58 @@ "assert (post_pred_f.mean == post_pred_y.mean).all()" ] }, + { + "cell_type": "markdown", + "id": "bff55240", + "metadata": {}, + "source": [ + "When `use_noise=False`, then the GP's prediction is an almost perfect\n", + "reconstruction of the ground truth function (in-distribution, so where we\n", + "have data).\n", + "In this case, the plot makes the GP prediction look like a perfect\n", + "*interpolation* of the noise-free data, so $\\test{\\ve\\mu} = \\ve y$ at the\n", + "train points $\\test{\\ma X} = \\ma X$. This\n", + "would be true if our GP model had exactly zero noise, so the likelihood's\n", + "$\\sigma_n^2$ would be zero. However `print(model`)\n", + "\n", + "```\n", + "ExactGPModel(\n", + " (likelihood): GaussianLikelihood(\n", + " (noise_covar): HomoskedasticNoise(\n", + " (raw_noise_constraint): GreaterThan(1.000E-04)\n", + " )\n", + " )\n", + " ...\n", + " ```\n", + "\n", + "shows that actually the min value is $10^{-4}$, so we technically always have\n", + "a regression setting, just with very small noise. The reason is that in the\n", + "GP equations, we have\n", + "\n", + "$$\\test{\\ve\\mu} = \\test{\\ma K}\\,\\left(\\ma K+\\sigma_n^2\\,\\ma I_N\\right)^{-1}\\,\\ve y$$\n", + "\n", + "where $\\sigma_n^2$ acts as a *regularization* parameter (also called \"jitter\n", + "term\" sometimes), which improves the\n", + "numerical stability of the linear system solve step\n", + "\n", + "$$\\left(\\ma K+\\sigma_n^2\\,\\ma I_N\\right)^{-1}\\,\\ve y\\:.$$\n", + "\n", + "Also we always keep $\\sigma_n^2$ as hyper parameter that we learn, and the\n", + "smallest value the hyper parameter optimization can reach is $10^{-4}$.\n", + "\n", + "While 3D plots are fun, they are not optimal for judging how well\n", + "the GP model represents the ground truth function." + ] + }, { "cell_type": "markdown", "id": "591e453d", "metadata": {}, "source": [ - "# Plot difference to ground truth and uncertainty" + "# Plot difference to ground truth and uncertainty\n", + "\n", + "Let's use contour plots to visualize the difference between GP prediction and\n", + "ground truth, as well as epistemic, total and aleatoric uncertainty." 
] }, { @@ -460,8 +572,10 @@ "metadata": {}, "outputs": [], "source": [ - "ncols = 3\n", - "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))\n", + "ncols = 4\n", + "fig, axs = plt.subplots(\n", + " ncols=ncols, nrows=1, figsize=(ncols * 5, 4), layout=\"compressed\"\n", + ")\n", "\n", "vmax = post_pred_y.stddev.max()\n", "cs = []\n", @@ -477,27 +591,42 @@ ")\n", "axs[0].set_title(\"|y_pred - y_true|\")\n", "\n", + "f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny))\n", + "y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))\n", + "\n", "cs.append(\n", " axs[1].contourf(\n", " data_pred.XG,\n", " data_pred.YG,\n", - " post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " f_std,\n", " vmin=0,\n", " vmax=vmax,\n", " )\n", ")\n", - "axs[1].set_title(\"f_std (epistemic)\")\n", + "axs[1].set_title(\"epistemic: f_std\")\n", "\n", "cs.append(\n", " axs[2].contourf(\n", " data_pred.XG,\n", " data_pred.YG,\n", - " post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " y_std,\n", " vmin=0,\n", " vmax=vmax,\n", " )\n", ")\n", - "axs[2].set_title(\"y_std (epistemic + aleatoric)\")\n", + "axs[2].set_title(\"total: y_std\")\n", + "\n", + "cs.append(\n", + " axs[3].contourf(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " y_std - f_std,\n", + " vmin=0,\n", + " cmap=\"plasma\",\n", + " ##vmax=vmax,\n", + " )\n", + ")\n", + "axs[3].set_title(\"aleatoric: y_std - f_std\")\n", "\n", "for ax, c in zip(axs, cs):\n", " ax.set_xlabel(\"X_0\")\n", @@ -511,7 +640,7 @@ "id": "04257cea", "metadata": {}, "source": [ - "# Check learned noise" + "## Let's check the learned noise" ] }, { @@ -521,13 +650,22 @@ "metadata": {}, "outputs": [], "source": [ - "print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt())\n", + "# Target noise to learn\n", + "print(\"data noise:\", noise_std)\n", + "\n", + "# The two below must be the same\n", "print(\n", - " np.sqrt(\n", - " extract_model_params(model, raw=False)[\"likelihood.noise_covar.noise\"]\n", - " )\n", + " \"learned noise:\",\n", + " (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n", ")\n", - "print(noise_std)" + "print(\n", + " \"learned noise:\",\n", + " np.sqrt(\n", + " extract_model_params(model, try_item=True)[\n", + " \"likelihood.noise_covar.noise\"\n", + " ]\n", + " ),\n", + ")" ] }, { @@ -535,7 +673,68 @@ "id": "1da209ff", "metadata": {}, "source": [ - "# Plot confidence bands" + "# Observations" + ] + }, + { + "cell_type": "markdown", + "id": "c52e65c1", + "metadata": {}, + "source": [ + "We have the following terms:\n", + "\n", + "* epistemic: `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$\n", + "* total: `y_std` = $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N)}$\n", + "* aleatoric: we have two ways of representing it\n", + " * from the likelihood: $\\sigma_n$\n", + " * for plotting: we use `y_std` - `f_std`, this is $\\neq \\sigma_n$ because of the $\\sqrt{\\cdot}$\n", + " above\n", + "\n", + "We can make the following observations:\n", + "\n", + "* Exercise 1: `use_noise=False`, `use_gap=False`\n", + " * The epistemic uncertainty `f_std` is a good indicator\n", + " of the (small) differences between model prediction and ground truth\n", + " * The learned variance $\\sigma_n^2$, and hence the aleatoric uncertainty is\n", + " near zero, which makes sense for noise-free data\n", + "* Exercise 2: `use_noise=False`, `use_gap=True`\n", + " * When faced with out-of-distribution (OOD) data, the epistemic `f_std`\n", + " clearly shows where the model 
will make wrong (less trustworthy)\n", + " predictions\n", + "* Exercise 3: `use_noise=True`, `use_gap=True`\n", + " * in-distribution (where we have data)\n", + " * The distinction between\n", + " epistemic and aleatoric uncertainty in the way we define it is less meaningful,\n", + " hence, `f_std` doesn't correlate well with `y_pred - y_true`. The\n", + " reason is that the noise $\\sigma_n$ shows up in two parts: (a) in the\n", + " equation of $\\test{\\ma\\Sigma}$ itself, so the \"epistemic\" uncertainty\n", + " `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$ is bigger just because we have\n", + " noise (regression) and (b) we add it in $\\sqrt{\\diag(\\test{\\ma\\Sigma} +\n", + " \\sigma_n^2\\,\\ma I_N)}$ to get the total `y_std`\n", + " * The `y_std` plot looks like the `f_std` one, but shifted by a constant.\n", + " But this is not the case because we compare standard deviations and not\n", + " variances, hence `y_std` - `f_std` is not constant, and in particular\n", + " $\\neq \\sigma_n$, but both are in the same numerical range (0.15 vs. 0.2).\n", + " * out-of-distribution: `f_std` (epistemic) dominates" + ] + }, + { + "cell_type": "markdown", + "id": "08e949a2", + "metadata": {}, + "source": [ + "# Exercises\n", + "\n", + "Go back up, switch on the settings for Exercise 2 and re-run the notebook.\n", + "Same with Exercise 3." + ] + }, + { + "cell_type": "markdown", + "id": "ee89f26a", + "metadata": {}, + "source": [ + "# Bonus: plot confidence bands" ] }, { @@ -587,6 +786,18 @@ "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" } }, "nbformat": 4, diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py index b7ee2273a279ef69e86f6d47fb1ac71d4fb13f97..6c59344978518666d3d3b2bcd11fcea1b4a5b0d3 100644 --- a/BLcourse2.3/02_two_dim.py +++ b/BLcourse2.3/02_two_dim.py @@ -6,18 +6,32 @@ # extension: .py # format_name: light # format_version: '1.5' -# jupytext_version: 1.16.2 +# jupytext_version: 1.17.1 # kernelspec: # display_name: Python 3 (ipykernel) # language: python # name: python3 # --- -# + -# %matplotlib notebook -# ##%matplotlib widget +# # About +# +# In this notebook, we use a GP to fit a 2D data set. We use the same ExactGP +# machinery as in the 1D case and show how GPs can be used for 2D interpolation +# (when data is free of noise) or regression (noisy data). Think of this as a +# toy geospatial data setting. Actually, in geostatistics, Gaussian process +# regression is known as [Kriging](https://en.wikipedia.org/wiki/Kriging). 
+# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$ +# $\newcommand{\ma}[1]{\mathbf{#1}}$ +# $\newcommand{\pred}[1]{\rm{#1}}$ +# $\newcommand{\predve}[1]{\mathbf{#1}}$ +# $\newcommand{\test}[1]{#1_*}$ +# $\newcommand{\testtest}[1]{#1_{**}}$ +# $\DeclareMathOperator{\diag}{diag}$ +# $\DeclareMathOperator{\cov}{cov}$ + +# ##%matplotlib notebook +# %matplotlib widget # ##%matplotlib inline -# - # + from collections import defaultdict @@ -39,6 +53,13 @@ torch.set_default_dtype(torch.float64) torch.manual_seed(123) # # Generate toy 2D data +# +# Our ground truth function is $f(\ve x) = \sin(r) / r$ with $\ve x = +# [x_0,x_1] \in\mathbb R^2$ and the radial distance +# $r=\sqrt{\ve x^\top\,\ve x}$, also known as "Mexican hat" function, which is a +# radial wave-like pattern which decays with distance from the center $\ve +# x=\ve 0$. We generate data by random sampling 2D points $\ve x_i$ and calculating +# $y_i = f(\ve x_i)$, optionally adding Gaussian noise further down. class MexicanHat: @@ -112,39 +133,44 @@ X_pred = data_pred.X # # Exercise 1 -# + +# Keep the settings below and explore the notebook till the end first. + use_noise = False use_gap = False -# - # # Exercise 2 +# First complete the notebook as is, then come back here. + # + -##use_noise = True -##use_gap = False +##use_noise = False +##use_gap = True # - # # Exercise 3 +# First complete the notebook with Exercise 2, then come back here. + # + -##use_noise = False +##use_noise = True ##use_gap = True # - -# + if use_noise: # noisy train data noise_std = 0.2 noise_dist = torch.distributions.Normal(loc=0, scale=noise_std) - y_train = data_train.z + noise_dist.sample_n(len(data_train.z)) + y_train = data_train.z + noise_dist.sample( + sample_shape=(len(data_train.z),) + ) else: # noise-free train data noise_std = 0 y_train = data_train.z -# - # + -# Cut out part of the train data to create out-of-distribution predictions +# Cut out part of the train data to create out-of-distribution predictions. +# Same as the "gaps" we created in the 1D case. if use_gap: mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0) @@ -170,6 +196,9 @@ s1 = ax.scatter( ax.set_xlabel("X_0") ax.set_ylabel("X_1") +# The gray surface is the ground truth function. The blue points are the +# training data. 
+ # # Define GP model @@ -206,7 +235,7 @@ model = ExactGPModel(X_train, y_train, likelihood) print(model) # Default start hyper params -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) # + # Set new start hyper params @@ -215,7 +244,7 @@ model.covar_module.base_kernel.lengthscale = 3.0 model.covar_module.outputscale = 8.0 model.likelihood.noise_covar.noise = 0.1 -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) # - # # Fit GP to data: optimize hyper params @@ -225,10 +254,10 @@ pprint(extract_model_params(model, raw=False)) model.train() likelihood.train() -optimizer = torch.optim.Adam(model.parameters(), lr=0.2) +optimizer = torch.optim.Adam(model.parameters(), lr=0.15) loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model) -n_iter = 300 +n_iter = 400 history = defaultdict(list) for ii in range(n_iter): optimizer.zero_grad() @@ -236,21 +265,24 @@ for ii in range(n_iter): loss.backward() optimizer.step() if (ii + 1) % 10 == 0: - print(f"iter {ii+1}/{n_iter}, {loss=:.3f}") - for p_name, p_val in extract_model_params(model).items(): + print(f"iter {ii + 1}/{n_iter}, {loss=:.3f}") + for p_name, p_val in extract_model_params(model, try_item=True).items(): history[p_name].append(p_val) history["loss"].append(loss.item()) # - ncols = len(history) -fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5)) -for ax, (p_name, p_lst) in zip(axs, history.items()): - ax.plot(p_lst) - ax.set_title(p_name) - ax.set_xlabel("iterations") +fig, axs = plt.subplots( + ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed" +) +with torch.no_grad(): + for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") # Values of optimized hyper params -pprint(extract_model_params(model, raw=False)) +pprint(extract_model_params(model)) # # Run prediction @@ -284,12 +316,53 @@ with torch.no_grad(): assert (post_pred_f.mean == post_pred_y.mean).all() # - +# When `use_noise=False`, then the GP's prediction is an almost perfect +# reconstruction of the ground truth function (in-distribution, so where we +# have data). +# In this case, the plot makes the GP prediction look like a perfect +# *interpolation* of the noise-free data, so $\test{\ve\mu} = \ve y$ at the +# train points $\test{\ma X} = \ma X$. This +# would be true if our GP model had exactly zero noise, so the likelihood's +# $\sigma_n^2$ would be zero. However `print(model`) +# +# ``` +# ExactGPModel( +# (likelihood): GaussianLikelihood( +# (noise_covar): HomoskedasticNoise( +# (raw_noise_constraint): GreaterThan(1.000E-04) +# ) +# ) +# ... +# ``` +# +# shows that actually the min value is $10^{-4}$, so we technically always have +# a regression setting, just with very small noise. The reason is that in the +# GP equations, we have +# +# $$\test{\ve\mu} = \test{\ma K}\,\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y$$ +# +# where $\sigma_n^2$ acts as a *regularization* parameter (also called "jitter +# term" sometimes), which improves the +# numerical stability of the linear system solve step +# +# $$\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y\:.$$ +# +# Also we always keep $\sigma_n^2$ as hyper parameter that we learn, and the +# smallest value the hyper parameter optimization can reach is $10^{-4}$. +# +# While 3D plots are fun, they are not optimal for judging how well +# the GP model represents the ground truth function. 
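+
+# As a small, self-contained aside on the jitter / regularization point above
+# (a sketch in plain torch, not part of the GP model code): adding
+# $\sigma_n^2$ to the diagonal of an RBF kernel matrix drastically lowers its
+# condition number, which is what makes the solve with
+# $\ma K+\sigma_n^2\,\ma I_N$ numerically better behaved. The lengthscale 0.3
+# and the `sigma_n2` values below are arbitrary choices for illustration.
+
+x_demo = torch.linspace(0.0, 1.0, 50)
+K_demo = torch.exp(-0.5 * (x_demo[:, None] - x_demo[None, :]) ** 2 / 0.3**2)
+for sigma_n2 in [0.0, 1e-4, 1e-2]:
+    # condition number of the (regularized) kernel matrix
+    cond = torch.linalg.cond(K_demo + sigma_n2 * torch.eye(len(x_demo)))
+    print(f"{sigma_n2=}, condition number: {cond.item():.3e}")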
# # Plot difference to ground truth and uncertainty +# +# Let's use contour plots to visualize the difference between GP prediction and +# ground truth, as well as epistemic, total and aleatoric uncertainty. # + -ncols = 3 -fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5)) +ncols = 4 +fig, axs = plt.subplots( + ncols=ncols, nrows=1, figsize=(ncols * 5, 4), layout="compressed" +) vmax = post_pred_y.stddev.max() cs = [] @@ -305,27 +378,42 @@ cs.append( ) axs[0].set_title("|y_pred - y_true|") +f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)) +y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)) + cs.append( axs[1].contourf( data_pred.XG, data_pred.YG, - post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)), + f_std, vmin=0, vmax=vmax, ) ) -axs[1].set_title("f_std (epistemic)") +axs[1].set_title("epistemic: f_std") cs.append( axs[2].contourf( data_pred.XG, data_pred.YG, - post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)), + y_std, vmin=0, vmax=vmax, ) ) -axs[2].set_title("y_std (epistemic + aleatoric)") +axs[2].set_title("total: y_std") + +cs.append( + axs[3].contourf( + data_pred.XG, + data_pred.YG, + y_std - f_std, + vmin=0, + cmap="plasma", + ##vmax=vmax, + ) +) +axs[3].set_title("aleatoric: y_std - f_std") for ax, c in zip(axs, cs): ax.set_xlabel("X_0") @@ -334,19 +422,71 @@ for ax, c in zip(axs, cs): fig.colorbar(c, ax=ax) # - -# # Check learned noise +# ## Let's check the learned noise # + -print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt()) +# Target noise to learn +print("data noise:", noise_std) + +# The two below must be the same +print( + "learned noise:", + (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(), +) print( + "learned noise:", np.sqrt( - extract_model_params(model, raw=False)["likelihood.noise_covar.noise"] - ) + extract_model_params(model, try_item=True)[ + "likelihood.noise_covar.noise" + ] + ), ) -print(noise_std) # - -# # Plot confidence bands +# # Observations + +# We have the following terms: +# +# * epistemic: `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ +# * total: `y_std` = $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N)}$ +# * aleatoric: we have two ways of representing it +# * from the likelihood: $\sigma_n$ +# * for plotting: we use `y_std` - `f_std`, this is $\neq \sigma_n$ because of the $\sqrt{\cdot}$ +# above +# +# We can make the following observations: +# +# * Exercise 1: `use_noise=False`, `use_gap=False` +# * The epistemic uncertainty `f_std` is a good indicator +# of the (small) differences between model prediction and ground truth +# * The learned variance $\sigma_n^2$, and hence the aleatoric uncertainty is +# near zero, which makes sense for noise-free data +# * Exercise 2: `use_noise=False`, `use_gap=True` +# * When faced with out-of-distribution (OOD) data, the epistemic `f_std` +# clearly shows where the model will make wrong (less trustworthy) +# predictions +# * Exercise 3: `use_noise=True`, `use_gap=True` +# * in-distribution (where we have data) +# * The distinction between +# epistemic and aleatoric uncertainty in the way we define it is less meaningful, +# hence, `f_std` doesn't correlate well with `y_pred - y_true`. 
The +# reason is that the noise $\sigma_n$ shows up in two parts: (a) in the +# equation of $\test{\ma\Sigma}$ itself, so the "epistemic" uncertainty +# `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ is bigger just because we have +# noise (regression) and (b) we add it in $\sqrt{\diag(\test{\ma\Sigma} + +# \sigma_n^2\,\ma I_N)}$ to get the total `y_std` +# * The `y_std` plot looks like the `f_std` one, but shifted by a constant. +# But this is not the case because we compare standard deviations and not +# variances, hence `y_std` - `f_std` is not constant, and in particular +# $\neq \sigma_n$, but both are in the same numerical range (0.15 vs. 0.2). +# * out-of-distribution: `f_std` (epistemic) dominates + +# # Exercises +# +# Go back up, switch on the settings for Exercise 2 and re-run the notebook. +# Same with Exercise 3. + +# # Bonus: plot confidence bands # + y_mean = post_pred_y.mean.reshape((data_pred.nx, data_pred.ny)) @@ -370,8 +510,6 @@ ax.set_zlim((contour_z, zlim[1] + abs(contour_z))) ax.contourf(data_pred.XG, data_pred.YG, y_std, zdir="z", offset=contour_z) # - -# + # When running as script if not is_interactive(): plt.show() -# - diff --git a/BLcourse2.3/03_one_dim_SVI.ipynb b/BLcourse2.3/03_one_dim_SVI.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0520fa0366033d88482c76a09f7bf646e7ba4eed --- /dev/null +++ b/BLcourse2.3/03_one_dim_SVI.ipynb @@ -0,0 +1,536 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b146bc52", + "metadata": {}, + "source": [ + "# About\n", + "\n", + "In this notebook, we replace the ExactGP inference and log marginal\n", + "likelihood optimization by using sparse stochastic variational inference.\n", + "This serves as an example of the many methods `gpytorch` offers to make GPs\n", + "scale to large data sets.\n", + "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n", + "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n", + "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n", + "$\\newcommand{\\test}[1]{#1_*}$\n", + "$\\newcommand{\\testtest}[1]{#1_{**}}$\n", + "$\\DeclareMathOperator{\\diag}{diag}$\n", + "$\\DeclareMathOperator{\\cov}{cov}$" + ] + }, + { + "cell_type": "markdown", + "id": "f96e3304", + "metadata": {}, + "source": [ + "# Imports, helpers, setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "193b2f13", + "metadata": {}, + "outputs": [], + "source": [ + "##%matplotlib notebook\n", + "%matplotlib widget\n", + "##%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c11c1030", + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from collections import defaultdict\n", + "from pprint import pprint\n", + "\n", + "import torch\n", + "import gpytorch\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib import is_interactive\n", + "import numpy as np\n", + "from torch.utils.data import TensorDataset, DataLoader\n", + "\n", + "from utils import extract_model_params, plot_samples\n", + "\n", + "\n", + "torch.set_default_dtype(torch.float64)\n", + "torch.manual_seed(123)" + ] + }, + { + "cell_type": "markdown", + "id": "460d4c3d", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Generate toy 1D data\n", + "\n", + "Now we generate 10x more points as in the ExactGP case, still the inference\n", + "won't be much slower (exact GPs scale roughly as $N^3$). 
Note that the data we\n", + "use here is still tiny (1000 points is easy even for exact GPs), so the\n", + "method's usefulness cannot be fully exploited in our example\n", + "-- also we don't even use a GPU yet :)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1a9647d", + "metadata": {}, + "outputs": [], + "source": [ + "def ground_truth(x, const):\n", + " return torch.sin(x) * torch.exp(-0.2 * x) + const\n", + "\n", + "\n", + "def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):\n", + " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", + " y = ground_truth(x, const=const) + noise_dist.sample(\n", + " sample_shape=(len(x),)\n", + " )\n", + " msk = torch.tensor([True] * len(x))\n", + " if gaps is not None:\n", + " for g in gaps:\n", + " msk = msk & ~((x > g[0]) & (x < g[1]))\n", + " return x[msk], y[msk], y\n", + "\n", + "\n", + "const = 5.0\n", + "noise_std = 0.1\n", + "x = torch.linspace(0, 4 * math.pi, 1000)\n", + "X_train, y_train, y_gt_train = generate_data(\n", + " x, gaps=[[6, 10]], const=const, noise_std=noise_std\n", + ")\n", + "X_pred = torch.linspace(\n", + " X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n", + ")\n", + "y_gt_pred = ground_truth(X_pred, const=const)\n", + "\n", + "print(f\"{X_train.shape=}\")\n", + "print(f\"{y_train.shape=}\")\n", + "print(f\"{X_pred.shape=}\")\n", + "\n", + "fig, ax = plt.subplots()\n", + "ax.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\", label=\"noisy data\")\n", + "ax.plot(X_pred, y_gt_pred, ls=\"--\", color=\"k\", label=\"ground truth\")\n", + "ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "4ad60c5e", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "# Define GP model\n", + "\n", + "The model follows [this\n", + "example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html)\n", + "based on [Hensman et al., \"Scalable Variational Gaussian Process Classification\",\n", + "2015](https://proceedings.mlr.press/v38/hensman15.html). 
The model is\n", + "\"sparse\" since it works with a set of *inducing* points $(\\ma Z, \\ve u),\n", + "\\ve u=f(\\ma Z)$ which is much smaller than the train data $(\\ma X, \\ve y)$.\n", + "\n", + "We have the same hyper parameters as before\n", + "\n", + "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n", + "* $\\sigma_n^2$ = `likelihood.noise_covar.noise`\n", + "* $s$ = `model.covar_module.outputscale`\n", + "* $m(\\ve x) = c$ = `model.mean_module.constant`\n", + "\n", + "plus additional ones, introduced by the approximations used:\n", + "\n", + "* the learnable inducing points $\\ma Z$ for the variational distribution\n", + " $q_{\\ve\\psi}(\\ve u)$\n", + "* learnable parameters of the variational distribution $q_{\\ve\\psi}(\\ve u)=\\mathcal N(\\ve m_u, \\ma S)$: the\n", + " variational mean $\\ve m_u$ and covariance $\\ma S$ in form a lower triangular\n", + " matrix $\\ma L$ such that $\\ma S=\\ma L\\,\\ma L^\\top$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "444929a3", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "class ApproxGPModel(gpytorch.models.ApproximateGP):\n", + " def __init__(self, Z):\n", + " # Approximate inducing value posterior q(u), u = f(Z), Z = inducing\n", + " # points (subset of X_train)\n", + " variational_distribution = (\n", + " gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))\n", + " )\n", + " # Compute q(f(X)) from q(u)\n", + " variational_strategy = gpytorch.variational.VariationalStrategy(\n", + " self,\n", + " Z,\n", + " variational_distribution,\n", + " learn_inducing_locations=True,\n", + " )\n", + " super().__init__(variational_strategy)\n", + " self.mean_module = gpytorch.means.ConstantMean()\n", + " self.covar_module = gpytorch.kernels.ScaleKernel(\n", + " gpytorch.kernels.RBFKernel()\n", + " )\n", + "\n", + " def forward(self, x):\n", + " mean_x = self.mean_module(x)\n", + " covar_x = self.covar_module(x)\n", + " return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)\n", + "\n", + "\n", + "likelihood = gpytorch.likelihoods.GaussianLikelihood()\n", + "\n", + "n_train = len(X_train)\n", + "# Start values for inducing points Z, use 10% random sub-sample of X_train.\n", + "ind_points_fraction = 0.1\n", + "ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)]\n", + "model = ApproxGPModel(Z=X_train[ind_idxs])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5861163d", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the model\n", + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98032457", + "metadata": {}, + "outputs": [], + "source": [ + "# Inspect the likelihood. 
In contrast to ExactGP, the likelihood is not part of\n", + "# the GP model instance.\n", + "print(likelihood)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ec97687", + "metadata": {}, + "outputs": [], + "source": [ + "# Default start hyper params\n", + "print(\"model params:\")\n", + "pprint(extract_model_params(model))\n", + "print(\"likelihood params:\")\n", + "pprint(extract_model_params(likelihood))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "167d584c", + "metadata": {}, + "outputs": [], + "source": [ + "# Set new start hyper params (scalars only)\n", + "model.mean_module.constant = 3.0\n", + "model.covar_module.base_kernel.lengthscale = 1.0\n", + "model.covar_module.outputscale = 1.0\n", + "likelihood.noise_covar.noise = 0.3" + ] + }, + { + "cell_type": "markdown", + "id": "6704ce2f", + "metadata": {}, + "source": [ + "# Fit GP to data: optimize hyper params\n", + "\n", + "Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),\n", + "where we optimize not the log marginal likelihood (ExactGP case),\n", + "but an ELBO (evidence lower bound) objective\n", + "\n", + "$$\n", + "\\max_\\ve\\zeta\\left(\\mathbb E_{q_{\\ve\\psi}(\\ve u)}\\left[\\ln p(\\ve y|\\ve u) \\right] -\n", + "D_{\\text{KL}}(q_{\\ve\\psi}(\\ve u)\\Vert p(\\ve u))\\right)\n", + "$$\n", + "\n", + "with respect to\n", + "\n", + "$$\\ve\\zeta = [\\ell, \\sigma_n^2, s, c, \\ve\\psi] $$\n", + "\n", + "with\n", + "\n", + "$$\\ve\\psi = [\\ve m_u, \\ma Z, \\ma L]$$\n", + "\n", + "the parameters of the variational distribution $q_{\\ve\\psi}(\\ve u)$.\n", + "\n", + "In addition, we perform a stochastic\n", + "optimization by using a deep learning type mini-batch loop, hence\n", + "\"stochastic\" variational inference (SVI). The latter speeds up the\n", + "optimization since we only look at a fraction of data per optimizer step to\n", + "calculate an approximate loss gradient (`loss.backward()`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f39d86f8", + "metadata": {}, + "outputs": [], + "source": [ + "# Train mode\n", + "model.train()\n", + "likelihood.train()\n", + "\n", + "optimizer = torch.optim.Adam(\n", + " [dict(params=model.parameters()), dict(params=likelihood.parameters())],\n", + " lr=0.1,\n", + ")\n", + "loss_func = gpytorch.mlls.VariationalELBO(\n", + " likelihood, model, num_data=X_train.shape[0]\n", + ")\n", + "\n", + "train_dl = DataLoader(\n", + " TensorDataset(X_train, y_train), batch_size=128, shuffle=True\n", + ")\n", + "\n", + "n_iter = 200\n", + "history = defaultdict(list)\n", + "for i_iter in range(n_iter):\n", + " for i_batch, (X_batch, y_batch) in enumerate(train_dl):\n", + " batch_history = defaultdict(list)\n", + " optimizer.zero_grad()\n", + " loss = -loss_func(model(X_batch), y_batch)\n", + " loss.backward()\n", + " optimizer.step()\n", + " param_dct = dict()\n", + " param_dct.update(extract_model_params(model, try_item=True))\n", + " param_dct.update(extract_model_params(likelihood, try_item=True))\n", + " for p_name, p_val in param_dct.items():\n", + " if isinstance(p_val, float):\n", + " batch_history[p_name].append(p_val)\n", + " batch_history[\"loss\"].append(loss.item())\n", + " for p_name, p_lst in batch_history.items():\n", + " history[p_name].append(np.mean(p_lst))\n", + " if (i_iter + 1) % 10 == 0:\n", + " print(f\"iter {i_iter + 1}/{n_iter}, {loss=:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc39b5d9", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot scalar hyper params and loss (ELBO) convergence\n", + "ncols = len(history)\n", + "fig, axs = plt.subplots(\n", + " ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n", + ")\n", + "with torch.no_grad():\n", + " for ax, (p_name, p_lst) in zip(axs, history.items()):\n", + " ax.plot(p_lst)\n", + " ax.set_title(p_name)\n", + " ax.set_xlabel(\"iterations\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4907e39", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Values of optimized hyper params\n", + "print(\"model params:\")\n", + "pprint(extract_model_params(model))\n", + "print(\"likelihood params:\")\n", + "pprint(extract_model_params(likelihood))" + ] + }, + { + "cell_type": "markdown", + "id": "4ae35e11", + "metadata": {}, + "source": [ + "# Run prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "629b4658", + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluation (predictive posterior) mode\n", + "model.eval()\n", + "likelihood.eval()\n", + "\n", + "with torch.no_grad():\n", + " M = 10\n", + " post_pred_f = model(X_pred)\n", + " post_pred_y = likelihood(model(X_pred))\n", + "\n", + " fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)\n", + " fig_sigmas, ax_sigmas = plt.subplots()\n", + " for ii, (ax, post_pred, name, title) in enumerate(\n", + " zip(\n", + " axs,\n", + " [post_pred_f, post_pred_y],\n", + " [\"f\", \"y\"],\n", + " [\"epistemic uncertainty\", \"total uncertainty\"],\n", + " )\n", + " ):\n", + " yf_mean = post_pred.mean\n", + " yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n", + "\n", + " yf_std = post_pred.stddev\n", + " lower = yf_mean - 2 * yf_std\n", + " upper = yf_mean + 2 * yf_std\n", + " ax.plot(\n", + " X_train.numpy(),\n", + " y_train.numpy(),\n", + " \"o\",\n", + " label=\"data\",\n", + " color=\"tab:blue\",\n", + " )\n", + " ax.plot(\n", + " 
X_pred.numpy(),\n", + " yf_mean.numpy(),\n", + " label=\"mean\",\n", + " color=\"tab:red\",\n", + " lw=2,\n", + " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " y_gt_pred.numpy(),\n", + " label=\"ground truth\",\n", + " color=\"k\",\n", + " lw=2,\n", + " ls=\"--\",\n", + " )\n", + " ax.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=\"confidence\",\n", + " color=\"tab:orange\",\n", + " alpha=0.3,\n", + " )\n", + " ax.set_title(f\"confidence = {title}\")\n", + " if name == \"f\":\n", + " sigma_label = r\"epistemic: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_*)}$\"\n", + " zorder = 1\n", + " else:\n", + " sigma_label = (\n", + " r\"total: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_* + \\sigma_n^2\\,I)}$\"\n", + " )\n", + " zorder = 0\n", + " ax_sigmas.fill_between(\n", + " X_pred.numpy(),\n", + " lower.numpy(),\n", + " upper.numpy(),\n", + " label=sigma_label,\n", + " color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n", + " alpha=0.5,\n", + " zorder=zorder,\n", + " )\n", + " y_min = y_train.min()\n", + " y_max = y_train.max()\n", + " y_span = y_max - y_min\n", + " ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])\n", + " plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. samples\")\n", + " if ii == 1:\n", + " ax.legend()\n", + " ax_sigmas.set_title(\"total vs. epistemic uncertainty\")\n", + " ax_sigmas.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "60abb7ec", + "metadata": {}, + "source": [ + "# Let's check the learned noise" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8a32f5b", + "metadata": {}, + "outputs": [], + "source": [ + "# Target noise to learn\n", + "print(\"data noise:\", noise_std)\n", + "\n", + "# The two below must be the same\n", + "print(\n", + " \"learned noise:\",\n", + " (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n", + ")\n", + "print(\n", + " \"learned noise:\",\n", + " np.sqrt(\n", + " extract_model_params(likelihood, try_item=True)[\"noise_covar.noise\"]\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0637729a", + "metadata": {}, + "outputs": [], + "source": [ + "# When running as script\n", + "if not is_interactive():\n", + " plt.show()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:light" + }, + "kernelspec": { + "display_name": "bayes-ml-course", + "language": "python", + "name": "bayes-ml-course" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py new file mode 100644 index 0000000000000000000000000000000000000000..644e61a37b818301dd4604bb25d74b64cb5be511 --- /dev/null +++ b/BLcourse2.3/03_one_dim_SVI.py @@ -0,0 +1,376 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.17.1 +# kernelspec: +# display_name: bayes-ml-course +# language: python +# name: bayes-ml-course +# --- + +# # About +# +# In this notebook, we replace the ExactGP inference and log marginal +# likelihood optimization by using sparse stochastic variational inference. 
+# This serves as an example of the many methods `gpytorch` offers to make GPs +# scale to large data sets. +# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$ +# $\newcommand{\ma}[1]{\mathbf{#1}}$ +# $\newcommand{\pred}[1]{\rm{#1}}$ +# $\newcommand{\predve}[1]{\mathbf{#1}}$ +# $\newcommand{\test}[1]{#1_*}$ +# $\newcommand{\testtest}[1]{#1_{**}}$ +# $\DeclareMathOperator{\diag}{diag}$ +# $\DeclareMathOperator{\cov}{cov}$ + +# # Imports, helpers, setup + +# ##%matplotlib notebook +# %matplotlib widget +# ##%matplotlib inline + +# + +import math +from collections import defaultdict +from pprint import pprint + +import torch +import gpytorch +from matplotlib import pyplot as plt +from matplotlib import is_interactive +import numpy as np +from torch.utils.data import TensorDataset, DataLoader + +from utils import extract_model_params, plot_samples + + +torch.set_default_dtype(torch.float64) +torch.manual_seed(123) +# - + +# # Generate toy 1D data +# +# Now we generate 10x more points as in the ExactGP case, still the inference +# won't be much slower (exact GPs scale roughly as $N^3$). Note that the data we +# use here is still tiny (1000 points is easy even for exact GPs), so the +# method's usefulness cannot be fully exploited in our example +# -- also we don't even use a GPU yet :). + + +# + +def ground_truth(x, const): + return torch.sin(x) * torch.exp(-0.2 * x) + const + + +def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None): + noise_dist = torch.distributions.Normal(loc=0, scale=noise_std) + y = ground_truth(x, const=const) + noise_dist.sample( + sample_shape=(len(x),) + ) + msk = torch.tensor([True] * len(x)) + if gaps is not None: + for g in gaps: + msk = msk & ~((x > g[0]) & (x < g[1])) + return x[msk], y[msk], y + + +const = 5.0 +noise_std = 0.1 +x = torch.linspace(0, 4 * math.pi, 1000) +X_train, y_train, y_gt_train = generate_data( + x, gaps=[[6, 10]], const=const, noise_std=noise_std +) +X_pred = torch.linspace( + X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False +) +y_gt_pred = ground_truth(X_pred, const=const) + +print(f"{X_train.shape=}") +print(f"{y_train.shape=}") +print(f"{X_pred.shape=}") + +fig, ax = plt.subplots() +ax.scatter(X_train, y_train, marker="o", color="tab:blue", label="noisy data") +ax.plot(X_pred, y_gt_pred, ls="--", color="k", label="ground truth") +ax.legend() +# - + +# # Define GP model +# +# The model follows [this +# example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html) +# based on [Hensman et al., "Scalable Variational Gaussian Process Classification", +# 2015](https://proceedings.mlr.press/v38/hensman15.html). The model is +# "sparse" since it works with a set of *inducing* points $(\ma Z, \ve u), +# \ve u=f(\ma Z)$ which is much smaller than the train data $(\ma X, \ve y)$. 
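+#
+# As a quick reminder of the bookkeeping involved (standard sparse
+# variational GP setup following the Hensman et al. reference above, not
+# something specific to this notebook): the model learns a variational
+# posterior $q_{\ve\psi}(\ve u)$ over the inducing values, and function
+# values are predicted by marginalizing $\ve u$ out,
+#
+# $$q(\predve f) = \int p(\predve f|\ve u)\,q_{\ve\psi}(\ve u)\,d\ve u$$
+#
+# which is what the `VariationalStrategy` in the model below takes care of
+# (see the code comment "Compute q(f(X)) from q(u)").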
+# +# We have the same hyper parameters as before +# +# * $\ell$ = `model.covar_module.base_kernel.lengthscale` +# * $\sigma_n^2$ = `likelihood.noise_covar.noise` +# * $s$ = `model.covar_module.outputscale` +# * $m(\ve x) = c$ = `model.mean_module.constant` +# +# plus additional ones, introduced by the approximations used: +# +# * the learnable inducing points $\ma Z$ for the variational distribution +# $q_{\ve\psi}(\ve u)$ +# * learnable parameters of the variational distribution $q_{\ve\psi}(\ve u)=\mathcal N(\ve m_u, \ma S)$: the +# variational mean $\ve m_u$ and covariance $\ma S$ in form a lower triangular +# matrix $\ma L$ such that $\ma S=\ma L\,\ma L^\top$ + + +# + +class ApproxGPModel(gpytorch.models.ApproximateGP): + def __init__(self, Z): + # Approximate inducing value posterior q(u), u = f(Z), Z = inducing + # points (subset of X_train) + variational_distribution = ( + gpytorch.variational.CholeskyVariationalDistribution(Z.size(0)) + ) + # Compute q(f(X)) from q(u) + variational_strategy = gpytorch.variational.VariationalStrategy( + self, + Z, + variational_distribution, + learn_inducing_locations=True, + ) + super().__init__(variational_strategy) + self.mean_module = gpytorch.means.ConstantMean() + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel() + ) + + def forward(self, x): + mean_x = self.mean_module(x) + covar_x = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + + +likelihood = gpytorch.likelihoods.GaussianLikelihood() + +n_train = len(X_train) +# Start values for inducing points Z, use 10% random sub-sample of X_train. +ind_points_fraction = 0.1 +ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)] +model = ApproxGPModel(Z=X_train[ind_idxs]) +# - + + +# Inspect the model +print(model) + +# Inspect the likelihood. In contrast to ExactGP, the likelihood is not part of +# the GP model instance. +print(likelihood) + +# Default start hyper params +print("model params:") +pprint(extract_model_params(model)) +print("likelihood params:") +pprint(extract_model_params(likelihood)) + +# Set new start hyper params (scalars only) +model.mean_module.constant = 3.0 +model.covar_module.base_kernel.lengthscale = 1.0 +model.covar_module.outputscale = 1.0 +likelihood.noise_covar.noise = 0.3 + +# # Fit GP to data: optimize hyper params +# +# Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI), +# where we optimize not the log marginal likelihood (ExactGP case), +# but an ELBO (evidence lower bound) objective +# +# $$ +# \max_\ve\zeta\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] - +# D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right) +# $$ +# +# with respect to +# +# $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$ +# +# with +# +# $$\ve\psi = [\ve m_u, \ma Z, \ma L]$$ +# +# the parameters of the variational distribution $q_{\ve\psi}(\ve u)$. +# +# In addition, we perform a stochastic +# optimization by using a deep learning type mini-batch loop, hence +# "stochastic" variational inference (SVI). The latter speeds up the +# optimization since we only look at a fraction of data per optimizer step to +# calculate an approximate loss gradient (`loss.backward()`). 
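+#
+# Schematically (our summary; see the `gpytorch.mlls.VariationalELBO` docs
+# for the exact implementation): for a mini-batch $B\subset\{1,\ldots,N\}$,
+# the expected log likelihood term is estimated from the batch and re-scaled,
+#
+# $$\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] \approx
+# \frac{N}{|B|}\sum_{i\in B}\mathbb E\left[\ln p(y_i|f_i)\right]$$
+#
+# where the expectation on the right is over the marginal $q(f_i)$ of
+# $q(\predve f)$ at $\ve x_i$, while the KL term is computed in closed form
+# once per optimizer step. The factor $N/|B|$ is the reason we pass
+# `num_data=X_train.shape[0]` to `VariationalELBO` below; whether the
+# objective is additionally normalized by $N$ is an implementation detail
+# that does not affect the optimum.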
+ +# + +# Train mode +model.train() +likelihood.train() + +optimizer = torch.optim.Adam( + [dict(params=model.parameters()), dict(params=likelihood.parameters())], + lr=0.1, +) +loss_func = gpytorch.mlls.VariationalELBO( + likelihood, model, num_data=X_train.shape[0] +) + +train_dl = DataLoader( + TensorDataset(X_train, y_train), batch_size=128, shuffle=True +) + +n_iter = 200 +history = defaultdict(list) +for i_iter in range(n_iter): + for i_batch, (X_batch, y_batch) in enumerate(train_dl): + batch_history = defaultdict(list) + optimizer.zero_grad() + loss = -loss_func(model(X_batch), y_batch) + loss.backward() + optimizer.step() + param_dct = dict() + param_dct.update(extract_model_params(model, try_item=True)) + param_dct.update(extract_model_params(likelihood, try_item=True)) + for p_name, p_val in param_dct.items(): + if isinstance(p_val, float): + batch_history[p_name].append(p_val) + batch_history["loss"].append(loss.item()) + for p_name, p_lst in batch_history.items(): + history[p_name].append(np.mean(p_lst)) + if (i_iter + 1) % 10 == 0: + print(f"iter {i_iter + 1}/{n_iter}, {loss=:.3f}") +# - + +# Plot scalar hyper params and loss (ELBO) convergence +ncols = len(history) +fig, axs = plt.subplots( + ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed" +) +with torch.no_grad(): + for ax, (p_name, p_lst) in zip(axs, history.items()): + ax.plot(p_lst) + ax.set_title(p_name) + ax.set_xlabel("iterations") + +# Values of optimized hyper params +print("model params:") +pprint(extract_model_params(model)) +print("likelihood params:") +pprint(extract_model_params(likelihood)) + + +# # Run prediction + +# + +# Evaluation (predictive posterior) mode +model.eval() +likelihood.eval() + +with torch.no_grad(): + M = 10 + post_pred_f = model(X_pred) + post_pred_y = likelihood(model(X_pred)) + + fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True) + fig_sigmas, ax_sigmas = plt.subplots() + for ii, (ax, post_pred, name, title) in enumerate( + zip( + axs, + [post_pred_f, post_pred_y], + ["f", "y"], + ["epistemic uncertainty", "total uncertainty"], + ) + ): + yf_mean = post_pred.mean + yf_samples = post_pred.sample(sample_shape=torch.Size((M,))) + + yf_std = post_pred.stddev + lower = yf_mean - 2 * yf_std + upper = yf_mean + 2 * yf_std + ax.plot( + X_train.numpy(), + y_train.numpy(), + "o", + label="data", + color="tab:blue", + ) + ax.plot( + X_pred.numpy(), + yf_mean.numpy(), + label="mean", + color="tab:red", + lw=2, + ) + ax.plot( + X_pred.numpy(), + y_gt_pred.numpy(), + label="ground truth", + color="k", + lw=2, + ls="--", + ) + ax.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label="confidence", + color="tab:orange", + alpha=0.3, + ) + ax.set_title(f"confidence = {title}") + if name == "f": + sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$" + zorder = 1 + else: + sigma_label = ( + r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$" + ) + zorder = 0 + ax_sigmas.fill_between( + X_pred.numpy(), + lower.numpy(), + upper.numpy(), + label=sigma_label, + color="tab:orange" if name == "f" else "tab:blue", + alpha=0.5, + zorder=zorder, + ) + y_min = y_train.min() + y_max = y_train.max() + y_span = y_max - y_min + ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span]) + plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples") + if ii == 1: + ax.legend() + ax_sigmas.set_title("total vs. 
epistemic uncertainty") + ax_sigmas.legend() +# - + +# # Let's check the learned noise + +# + +# Target noise to learn +print("data noise:", noise_std) + +# The two below must be the same +print( + "learned noise:", + (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(), +) +print( + "learned noise:", + np.sqrt( + extract_model_params(likelihood, try_item=True)["noise_covar.noise"] + ), +) +# - + +# When running as script +if not is_interactive(): + plt.show() diff --git a/BLcourse2.3/README.md b/BLcourse2.3/README.md index 21766d2c861fc568cce33e935241b908c3ce8938..28873335ed97e9dc83c448032e79fa6b25bc531d 100644 --- a/BLcourse2.3/README.md +++ b/BLcourse2.3/README.md @@ -8,4 +8,19 @@ $ jupyter-notebook notebook.ipynb ``` For convenience the ipynb file is included. Please keep it in sync with the py -file using jupytext. +file using `jupytext` (see `convert-to-ipynb.sh`). + +One-time setup of venv and ipy kernel: + +```sh +# or +# $ python -m venv --system-site-packages bayes-ml-course-sys +# $ source ./bayes-ml-course-sys/bin/activate +$ mkvirtualenv --system-site-packages bayes-ml-course-sys +$ pip install -r requirements.txt + +# Install custom kernel, select that in Jupyter. --sys-prefix installs into the +# current venv, while --user would install into ~/.local/share/jupyter/kernels/ +$ python -m ipykernel install --name bayes-ml-course --sys-prefix +Installed kernelspec bayes-ml-course in /home/elcorto/.virtualenvs/bayes-ml-course-sys/share/jupyter/kernels/bayes-ml-course +``` diff --git a/BLcourse2.3/convert-to-ipynb.sh b/BLcourse2.3/convert-to-ipynb.sh index be23e43b6e01e4ad115436765c007a8a12c78684..47d79cabe4e792141ee334421a2264913ce6afba 100755 --- a/BLcourse2.3/convert-to-ipynb.sh +++ b/BLcourse2.3/convert-to-ipynb.sh @@ -1,5 +1,10 @@ #!/bin/sh -for fn in 0*dim.py; do - jupytext --to ipynb $fn +# Remove plots etc, but keep random cell IDs. We need this b/c we use jupytext +# --update below. +nbstripout --keep-id *.ipynb + +for fn in $(ls -1 | grep -E '[0-9]+.+\.py'); do + # --update keeps call IDs -> smaller diffs + jupytext --to ipynb --update $fn done diff --git a/BLcourse2.3/requirements.txt b/BLcourse2.3/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a0a563ac5d2dd8a06ab84bdc9d5cb6ad6445ea6 --- /dev/null +++ b/BLcourse2.3/requirements.txt @@ -0,0 +1,39 @@ +# ---------------------------------------------------------------------------- +# Required to at least run the py scripts +# ---------------------------------------------------------------------------- +torch +gpytorch +pyro-ppl +matplotlib +scikit-learn + +# ---------------------------------------------------------------------------- +# Jupyter stuff, this must be present in the used Jupyter kernel on a hosted +# service like https://jupyter.jsc.fz-juelich.de +# ---------------------------------------------------------------------------- + +# Since we use paired notebooks. This is used for local dev, not really needed +# on a hosted service, but still useful to avoid confusing warnings (jupytext +# extension failed loading). 
+jupytext + +# for %matplotlib widget +ipympl + +# ---------------------------------------------------------------------------- +# Jupyter Lab, when running locally +# ---------------------------------------------------------------------------- + +# Jupyter Lab +jupyter + +# or Jupyter notebook is also fine +##notebook + +# ---------------------------------------------------------------------------- +# dev +# ---------------------------------------------------------------------------- + +nbstripout + +# vim:syn=sh diff --git a/BLcourse2.3/utils.py b/BLcourse2.3/utils.py index fc512d45c86aeb380087bd297aed0b3b29d84d7b..260be47d40d412b26d53a8a7db5e86c3e0996903 100644 --- a/BLcourse2.3/utils.py +++ b/BLcourse2.3/utils.py @@ -1,7 +1,8 @@ from matplotlib import pyplot as plt +import torch -def extract_model_params(model, raw=False) -> dict: +def extract_model_params(model, raw=False, try_item=False) -> dict: """Helper to convert model.named_parameters() to dict. With raw=True, use @@ -11,9 +12,25 @@ def extract_model_params(model, raw=False) -> dict: See https://docs.gpytorch.ai/en/stable/examples/00_Basic_Usage/Hyperparameters.html#Raw-vs-Actual-Parameters """ + if try_item: + + def get_val(p): + if isinstance(p, torch.Tensor): + if p.ndim == 0: + return p.item() + else: + p_sq = p.squeeze() + if (p_sq.ndim == 1 and len(p_sq) == 1) or p_sq.ndim == 0: + return p_sq.item() + else: + return p + else: + return p + else: + get_val = lambda p: p if raw: return dict( - (p_name, p_val.item()) + (p_name, get_val(p_val)) for p_name, p_val in model.named_parameters() ) else: @@ -24,7 +41,7 @@ def extract_model_params(model, raw=False) -> dict: # Yes, eval() hack. Sorry. p_name = p_name.replace(".raw_", ".") p_val = eval(f"model.{p_name}") - out[p_name] = p_val.item() + out[p_name] = get_val(p_val) return out diff --git a/slides/BLcourse2.3.pdf b/slides/BLcourse2.3.pdf index 9e1b7f8cb80b4521c9b003cfec3fd8288ac81d5f..a6ba5a21a17b725248be9e9481bd3dbfbe474474 100644 Binary files a/slides/BLcourse2.3.pdf and b/slides/BLcourse2.3.pdf differ
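For reference, a minimal usage sketch of the updated `extract_model_params` helper from `BLcourse2.3/utils.py` with `try_item=True`: single-element tensors (the scalar hyper parameters) come back as plain Python floats, while larger tensors are returned unchanged, which is what the SVI training loop relies on when it filters with `isinstance(p_val, float)`. The `model` object here is assumed to be the `ApproxGPModel` instance from `03_one_dim_SVI.py`.

```py
from utils import extract_model_params

# model: assumed to be the ApproxGPModel instance from 03_one_dim_SVI.py
params = extract_model_params(model, try_item=True)

for name, val in params.items():
    if isinstance(val, float):
        # 0-d or single-element tensors (lengthscale, outputscale, mean
        # constant, ...) were converted to plain floats via .item()
        print(f"{name}: {val:.4f}")
    else:
        # multi-element tensors (inducing points Z, variational mean and
        # covariance factor) are passed through unchanged
        print(f"{name}: tensor of shape {tuple(val.shape)}")
```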