diff --git a/BLcourse2.3/01_one_dim.ipynb b/BLcourse2.3/01_one_dim.ipynb
index f5208de38e0c4b0cc1a1ed052d6d9393a318aa04..7c993e8f48af0f824f9bc92a6af1fde90023830b 100644
--- a/BLcourse2.3/01_one_dim.ipynb
+++ b/BLcourse2.3/01_one_dim.ipynb
@@ -3,16 +3,19 @@
   {
    "cell_type": "markdown",
    "id": "2a8ed8ae",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
    "source": [
     "# Notation\n",
     "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n",
     "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n",
     "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n",
     "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n",
-    "$\\newcommand{\\cov}{\\mathrm{cov}}$\n",
     "$\\newcommand{\\test}[1]{#1_*}$\n",
     "$\\newcommand{\\testtest}[1]{#1_{**}}$\n",
+    "$\\DeclareMathOperator{\\diag}{diag}$\n",
+    "$\\DeclareMathOperator{\\cov}{cov}$\n",
     "\n",
     "Vector $\\ve a\\in\\mathbb R^n$ or $\\mathbb R^{n\\times 1}$, so \"column\" vector.\n",
     "Matrix $\\ma A\\in\\mathbb R^{n\\times m}$. Design matrix with input vectors $\\ve\n",
@@ -24,6 +27,19 @@
     "$\\ma X$ to keep the notation consistent with the slides."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "8a7cc3c8",
+   "metadata": {},
+   "source": [
+    "# About\n",
+    "\n",
+    "In this notebook, we explore the material presented in [the\n",
+    "slides](https://doi.org/10.6084/m9.figshare.25988176) with code, using the\n",
+    "[gpytorch](https://gpytorch.ai) library. We cover exact GP inference and\n",
+    "hyper parameter optimization using the marginal likelihood."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ae440984",
@@ -40,8 +56,8 @@
    "outputs": [],
    "source": [
     "##%matplotlib notebook\n",
-    "##%matplotlib widget\n",
-    "%matplotlib inline"
+    "%matplotlib widget\n",
+    "##%matplotlib inline"
    ]
   },
   {
@@ -59,6 +75,7 @@
     "import gpytorch\n",
     "from matplotlib import pyplot as plt\n",
     "from matplotlib import is_interactive\n",
+    "import numpy as np\n",
     "\n",
     "from utils import extract_model_params, plot_samples\n",
     "\n",
@@ -103,26 +120,41 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def generate_data(x, gaps=[[1, 3]], const=5):\n",
-    "    y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const\n",
+    "def ground_truth(x, const):\n",
+    "    return torch.sin(x) * torch.exp(-0.2 * x) + const\n",
+    "\n",
+    "\n",
+    "def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):\n",
+    "    noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n",
+    "    y = ground_truth(x, const=const) + noise_dist.sample(\n",
+    "        sample_shape=(len(x),)\n",
+    "    )\n",
     "    msk = torch.tensor([True] * len(x))\n",
     "    if gaps is not None:\n",
     "        for g in gaps:\n",
     "            msk = msk & ~((x > g[0]) & (x < g[1]))\n",
-    "    return x[msk], y[msk]\n",
+    "    return x[msk], y[msk], y\n",
     "\n",
     "\n",
+    "const = 5.0\n",
+    "noise_std = 0.1\n",
     "x = torch.linspace(0, 4 * math.pi, 100)\n",
-    "X_train, y_train = generate_data(x, gaps=[[6, 10]])\n",
+    "X_train, y_train, y_gt_train = generate_data(\n",
+    "    x, gaps=[[6, 10]], const=const, noise_std=noise_std\n",
+    ")\n",
     "X_pred = torch.linspace(\n",
     "    X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n",
     ")\n",
+    "y_gt_pred = ground_truth(X_pred, const=const)\n",
     "\n",
     "print(f\"{X_train.shape=}\")\n",
     "print(f\"{y_train.shape=}\")\n",
     "print(f\"{X_pred.shape=}\")\n",
     "\n",
-    "plt.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\")"
+    "fig, ax = plt.subplots()\n",
+    "ax.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\", label=\"noisy data\")\n",
+    "ax.plot(X_pred, y_gt_pred, ls=\"--\", color=\"k\", label=\"ground truth\")\n",
+    "ax.legend()"
    ]
   },
   {
@@ -145,7 +177,7 @@
     "`ScaleKernel`.\n",
     "\n",
     "In addition, we define a constant mean via `ConstantMean`. Finally we have\n",
-    "the likelihood noise $\\sigma_n^2$. So in total we have 4 hyper params.\n",
+    "the likelihood variance $\\sigma_n^2$. So in total we have 4 hyper params.\n",
     "\n",
     "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n",
     "* $\\sigma_n^2$ = `model.likelihood.noise_covar.noise`\n",
@@ -209,7 +241,7 @@
    "outputs": [],
    "source": [
     "# Default start hyper params\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
@@ -225,9 +257,9 @@
     "model.mean_module.constant = 3.0\n",
     "model.covar_module.base_kernel.lengthscale = 1.0\n",
     "model.covar_module.outputscale = 1.0\n",
-    "model.likelihood.noise_covar.noise = 0.1\n",
+    "model.likelihood.noise_covar.noise = 1e-3\n",
     "\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
@@ -237,11 +269,16 @@
    "source": [
     "# Sample from the GP prior\n",
     "\n",
-    "We sample a number of functions $f_j, j=1,\\ldots,M$ from the GP prior and\n",
+    "We sample a number of functions $f_m, m=1,\\ldots,M$ from the GP prior and\n",
     "evaluate them at all $\\ma X$ = `X_pred` points, of which we have $N=200$. So\n",
-    "we effectively generate samples from $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n",
-    "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ and the\n",
-    "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$."
+    "we effectively generate samples from `pri_f` =  $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n",
+    "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ represents a\n",
+    "sampled *function* $f$ evaluated the $N=200$ points in $\\ma X$. The\n",
+    "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$. Its diagonal\n",
+    "$\\diag\\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis.\n",
+    "This is what we plot as \"confidence band\" `f_mean` $\\pm$ `2 * f_std`.\n",
+    "The off-diagonal elements represent the correlation between different points\n",
+    "$K_{ij} = \\cov[f(\\ve x_i), f(\\ve x_j)]$."
    ]
   },
   {
@@ -286,11 +323,12 @@
    "id": "0598b567",
    "metadata": {},
    "source": [
-    "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n",
-    "\\ve c$ does *not* mean that each sampled vector $\\predve f$'s mean is\n",
-    "equal to $c$. Instead, we have that at each $\\ve x_i$, the mean of\n",
-    "*all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M f_j(\\ve x_i)\n",
-    "\\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n"
+    "Let's investigate the samples more closely. First we note that the samples\n",
+    "fluctuate around the mean `model.mean_module.constant` we defined above. A\n",
+    "constant mean $\\ve m(\\ma X) = \\ve c$ does *not* mean that each sampled vector\n",
+    "$\\predve f$'s mean is equal to $c$. Instead, we have that at each $\\ve x_i$,\n",
+    "the mean of *all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M\n",
+    "f_m(\\ve x_i) \\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n"
    ]
   },
   {
@@ -330,21 +368,20 @@
     "# GP posterior predictive distribution with fixed hyper params\n",
     "\n",
     "Now we calculate the posterior predictive distribution $p(\\test{\\predve\n",
-    "f}|\\test{\\ma X}, \\ma X, \\ve y)$, i.e. we condition on the train data (Bayesian\n",
+    "f}|\\test{\\ma X}, \\ma X, \\ve y) = \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$,\n",
+    "i.e. we condition on the train data (Bayesian\n",
     "inference).\n",
     "\n",
     "We use the fixed hyper param values defined above. In particular, since\n",
     "$\\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a\n",
-    "regression setting."
+    "regression setting -- the GP's mean doesn't interpolate all points."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "d01951f1",
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Evaluation (predictive posterior) mode\n",
@@ -375,6 +412,14 @@
     "        color=\"tab:red\",\n",
     "        lw=2,\n",
     "    )\n",
+    "    ax.plot(\n",
+    "        X_pred.numpy(),\n",
+    "        y_gt_pred.numpy(),\n",
+    "        label=\"ground truth\",\n",
+    "        color=\"k\",\n",
+    "        lw=2,\n",
+    "        ls=\"--\",\n",
+    "    )\n",
     "    ax.fill_between(\n",
     "        X_pred.numpy(),\n",
     "        lower.numpy(),\n",
@@ -395,20 +440,30 @@
    "cell_type": "markdown",
    "id": "b960a745",
    "metadata": {},
+   "source": [
+    "We observe that all sampled functions (green) and the mean (red) tend towards\n",
+    "the low fixed mean function $m(\\ve x)=c$ at 3.0 in the absence of data, while\n",
+    "the actual data mean is `const` from above (data generation). Also the other\n",
+    "hyper params ($\\ell$, $\\sigma_n^2$, $s$) are just guesses. Now we will\n",
+    "calculate their actual value by minimizing the negative log marginal\n",
+    "likelihood."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2de7f663",
+   "metadata": {},
    "source": [
     "# Fit GP to data: optimize hyper params\n",
     "\n",
     "In each step of the optimizer, we condition on the training data (e.g. do\n",
     "Bayesian inference) to calculate the posterior predictive distribution for\n",
-    "the current values of the hyper params. We iterate until the log marginal\n",
+    "the current values of the hyper params. We iterate until the negative log marginal\n",
     "likelihood is converged.\n",
     "\n",
     "We use a simplistic PyTorch-style hand written train loop without convergence\n",
     "control, so make sure to use enough `n_iter` and eyeball-check that the loss\n",
-    "is converged :-)\n",
-    "\n",
-    "Observe how all hyper params converge. In particular, note that the constant\n",
-    "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`."
+    "is converged :-)"
    ]
   },
   {
@@ -433,8 +488,8 @@
     "    loss.backward()\n",
     "    optimizer.step()\n",
     "    if (ii + 1) % 10 == 0:\n",
-    "        print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n",
-    "    for p_name, p_val in extract_model_params(model).items():\n",
+    "        print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n",
+    "    for p_name, p_val in extract_model_params(model, try_item=True).items():\n",
     "        history[p_name].append(p_val)\n",
     "    history[\"loss\"].append(loss.item())"
    ]
@@ -446,13 +501,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Plot hyper params and loss (neg. log marginal likelihood) convergence\n",
+    "# Plot hyper params and loss (negative log marginal likelihood) convergence\n",
     "ncols = len(history)\n",
-    "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n",
-    "for ax, (p_name, p_lst) in zip(axs, history.items()):\n",
-    "    ax.plot(p_lst)\n",
-    "    ax.set_title(p_name)\n",
-    "    ax.set_xlabel(\"iterations\")"
+    "fig, axs = plt.subplots(\n",
+    "    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n",
+    ")\n",
+    "with torch.no_grad():\n",
+    "    for ax, (p_name, p_lst) in zip(axs, history.items()):\n",
+    "        ax.plot(p_lst)\n",
+    "        ax.set_title(p_name)\n",
+    "        ax.set_xlabel(\"iterations\")"
    ]
   },
   {
@@ -463,27 +521,33 @@
    "outputs": [],
    "source": [
     "# Values of optimized hyper params\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "98aefb90",
    "metadata": {},
+   "source": [
+    "We see that all hyper params converge. In particular, note that the constant\n",
+    "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2f6afb0",
+   "metadata": {},
    "source": [
     "# Run prediction\n",
     "\n",
-    "We show \"noiseless\" (left: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma)}$) vs.\n",
-    "\"noisy\" (right: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma + \\sigma_n^2\\,\\ma\n",
-    "I_N)}$) predictions with\n",
-    "\n",
-    "$$\\ma\\Sigma = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$$\n",
+    "We run prediction with two variants of the posterior predictive distribution:\n",
+    "using either only the epistemic uncertainty or using the total uncertainty.\n",
     "\n",
-    "We find that $\\ma\\Sigma$ reflects behavior we would like to see from\n",
-    "epistemic uncertainty -- it is high when we have no data\n",
-    "(out-of-distribution). But this alone isn't the whole story. We need to add\n",
-    "the estimated noise level $\\sigma_n^2$ in order for the confidence band to\n",
-    "cover the data."
+    "* epistemic: $p(\\test{\\predve f}|\\test{\\ma X}, \\ma X, \\ve y) =\n",
+    "  \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$ = `post_pred_f` with\n",
+    "  $\\test{\\ma\\Sigma} = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$\n",
+    "* total: $p(\\test{\\predve y}|\\test{\\ma X}, \\ma X, \\ve y) =\n",
+    "  \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N))$ = `post_pred_y`"
    ]
   },
   {
@@ -502,10 +566,15 @@
     "    post_pred_f = model(X_pred)\n",
     "    post_pred_y = likelihood(model(X_pred))\n",
     "\n",
-    "    fig, axs = plt.subplots(ncols=2, figsize=(12, 5))\n",
+    "    fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)\n",
     "    fig_sigmas, ax_sigmas = plt.subplots()\n",
-    "    for ii, (ax, post_pred, name) in enumerate(\n",
-    "        zip(axs, [post_pred_f, post_pred_y], [\"f\", \"y\"])\n",
+    "    for ii, (ax, post_pred, name, title) in enumerate(\n",
+    "        zip(\n",
+    "            axs,\n",
+    "            [post_pred_f, post_pred_y],\n",
+    "            [\"f\", \"y\"],\n",
+    "            [\"epistemic uncertainty\", \"total uncertainty\"],\n",
+    "        )\n",
     "    ):\n",
     "        yf_mean = post_pred.mean\n",
     "        yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n",
@@ -527,6 +596,14 @@
     "            color=\"tab:red\",\n",
     "            lw=2,\n",
     "        )\n",
+    "        ax.plot(\n",
+    "            X_pred.numpy(),\n",
+    "            y_gt_pred.numpy(),\n",
+    "            label=\"ground truth\",\n",
+    "            color=\"k\",\n",
+    "            lw=2,\n",
+    "            ls=\"--\",\n",
+    "        )\n",
     "        ax.fill_between(\n",
     "            X_pred.numpy(),\n",
     "            lower.numpy(),\n",
@@ -535,19 +612,20 @@
     "            color=\"tab:orange\",\n",
     "            alpha=0.3,\n",
     "        )\n",
+    "        ax.set_title(f\"confidence = {title}\")\n",
     "        if name == \"f\":\n",
-    "            sigma_label = r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma)}$\"\n",
+    "            sigma_label = r\"epistemic: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_*)}$\"\n",
     "            zorder = 1\n",
     "        else:\n",
     "            sigma_label = (\n",
-    "                r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma + \\sigma_n^2\\,I)}$\"\n",
+    "                r\"total: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_* + \\sigma_n^2\\,I)}$\"\n",
     "            )\n",
     "            zorder = 0\n",
     "        ax_sigmas.fill_between(\n",
     "            X_pred.numpy(),\n",
     "            lower.numpy(),\n",
     "            upper.numpy(),\n",
-    "            label=\"confidence \" + sigma_label,\n",
+    "            label=sigma_label,\n",
     "            color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n",
     "            alpha=0.5,\n",
     "            zorder=zorder,\n",
@@ -559,9 +637,62 @@
     "        plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. samples\")\n",
     "        if ii == 1:\n",
     "            ax.legend()\n",
+    "    ax_sigmas.set_title(\"total vs. epistemic uncertainty\")\n",
     "    ax_sigmas.legend()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "4c52a919",
+   "metadata": {},
+   "source": [
+    "We find that $\\test{\\ma\\Sigma}$ reflects behavior we would like to see from\n",
+    "epistemic uncertainty -- it is high when we have no data\n",
+    "(out-of-distribution). But this alone isn't the whole story. We need to add\n",
+    "the estimated likelihood variance $\\sigma_n^2$ in order for the confidence\n",
+    "band to cover the data -- this turns the estimated total uncertainty into a\n",
+    "*calibrated* uncertainty."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8823deb0",
+   "metadata": {},
+   "source": [
+    "# Let's check the learned noise\n",
+    "\n",
+    "We compare the target data noise (`noise_std`) to the learned GP noise, in\n",
+    "the form of the likelihood *standard deviation* $\\sigma_n$. The latter is\n",
+    "equal to the $\\sqrt{\\cdot}$ of `likelihood.noise_covar.noise` and can also be\n",
+    "calculated via $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N) -\n",
+    "\\diag(\\test{\\ma\\Sigma}})$."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7cd1c02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Target noise to learn\n",
+    "print(\"data noise:\", noise_std)\n",
+    "\n",
+    "# The two below must be the same\n",
+    "print(\n",
+    "    \"learned noise:\",\n",
+    "    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n",
+    ")\n",
+    "print(\n",
+    "    \"learned noise:\",\n",
+    "    np.sqrt(\n",
+    "        extract_model_params(model, try_item=True)[\n",
+    "            \"likelihood.noise_covar.noise\"\n",
+    "        ]\n",
+    "    ),\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -580,9 +711,21 @@
    "formats": "ipynb,py:light"
   },
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "bayes-ml-course",
    "language": "python",
-   "name": "python3"
+   "name": "bayes-ml-course"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py
index e9e55f233fc306eb2da38ca98ce710d5a359daa0..f29e5bbd6556259f674c9c8fcf7b3935461ddc94 100644
--- a/BLcourse2.3/01_one_dim.py
+++ b/BLcourse2.3/01_one_dim.py
@@ -6,11 +6,11 @@
 #       extension: .py
 #       format_name: light
 #       format_version: '1.5'
-#       jupytext_version: 1.16.2
+#       jupytext_version: 1.17.1
 #   kernelspec:
-#     display_name: Python 3 (ipykernel)
+#     display_name: bayes-ml-course
 #     language: python
-#     name: python3
+#     name: bayes-ml-course
 # ---
 
 # # Notation
@@ -18,9 +18,10 @@
 # $\newcommand{\ma}[1]{\mathbf{#1}}$
 # $\newcommand{\pred}[1]{\rm{#1}}$
 # $\newcommand{\predve}[1]{\mathbf{#1}}$
-# $\newcommand{\cov}{\mathrm{cov}}$
 # $\newcommand{\test}[1]{#1_*}$
 # $\newcommand{\testtest}[1]{#1_{**}}$
+# $\DeclareMathOperator{\diag}{diag}$
+# $\DeclareMathOperator{\cov}{cov}$
 #
 # Vector $\ve a\in\mathbb R^n$ or $\mathbb R^{n\times 1}$, so "column" vector.
 # Matrix $\ma A\in\mathbb R^{n\times m}$. Design matrix with input vectors $\ve
@@ -31,11 +32,19 @@
 # we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with
 # $\ma X$ to keep the notation consistent with the slides.
 
+
+# # About
+#
+# In this notebook, we explore the material presented in [the
+# slides](https://doi.org/10.6084/m9.figshare.25988176) with code, using the
+# [gpytorch](https://gpytorch.ai) library. We cover exact GP inference and
+# hyper parameter optimization using the marginal likelihood.
+
 # # Imports, helpers, setup
 
 # ##%matplotlib notebook
-# ##%matplotlib widget
-# %matplotlib inline
+# %matplotlib widget
+# ##%matplotlib inline
 
 # +
 import math
@@ -46,6 +55,7 @@ import torch
 import gpytorch
 from matplotlib import pyplot as plt
 from matplotlib import is_interactive
+import numpy as np
 
 from utils import extract_model_params, plot_samples
 
@@ -77,26 +87,41 @@ torch.manual_seed(123)
 
 
 # +
-def generate_data(x, gaps=[[1, 3]], const=5):
-    y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const
+def ground_truth(x, const):
+    return torch.sin(x) * torch.exp(-0.2 * x) + const
+
+
+def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):
+    noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)
+    y = ground_truth(x, const=const) + noise_dist.sample(
+        sample_shape=(len(x),)
+    )
     msk = torch.tensor([True] * len(x))
     if gaps is not None:
         for g in gaps:
             msk = msk & ~((x > g[0]) & (x < g[1]))
-    return x[msk], y[msk]
+    return x[msk], y[msk], y
 
 
+const = 5.0
+noise_std = 0.1
 x = torch.linspace(0, 4 * math.pi, 100)
-X_train, y_train = generate_data(x, gaps=[[6, 10]])
+X_train, y_train, y_gt_train = generate_data(
+    x, gaps=[[6, 10]], const=const, noise_std=noise_std
+)
 X_pred = torch.linspace(
     X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False
 )
+y_gt_pred = ground_truth(X_pred, const=const)
 
 print(f"{X_train.shape=}")
 print(f"{y_train.shape=}")
 print(f"{X_pred.shape=}")
 
-plt.scatter(X_train, y_train, marker="o", color="tab:blue")
+fig, ax = plt.subplots()
+ax.scatter(X_train, y_train, marker="o", color="tab:blue", label="noisy data")
+ax.plot(X_pred, y_gt_pred, ls="--", color="k", label="ground truth")
+ax.legend()
 # -
 
 # # Define GP model
@@ -112,7 +137,7 @@ plt.scatter(X_train, y_train, marker="o", color="tab:blue")
 # `ScaleKernel`.
 #
 # In addition, we define a constant mean via `ConstantMean`. Finally we have
-# the likelihood noise $\sigma_n^2$. So in total we have 4 hyper params.
+# the likelihood variance $\sigma_n^2$. So in total we have 4 hyper params.
 #
 # * $\ell$ = `model.covar_module.base_kernel.lengthscale`
 # * $\sigma_n^2$ = `model.likelihood.noise_covar.noise`
@@ -154,26 +179,31 @@ model = ExactGPModel(X_train, y_train, likelihood)
 print(model)
 
 # Default start hyper params
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
 
 # +
 # Set new start hyper params
 model.mean_module.constant = 3.0
 model.covar_module.base_kernel.lengthscale = 1.0
 model.covar_module.outputscale = 1.0
-model.likelihood.noise_covar.noise = 0.1
+model.likelihood.noise_covar.noise = 1e-3
 
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
 # -
 
 
 # # Sample from the GP prior
 #
-# We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and
+# We sample a number of functions $f_m, m=1,\ldots,M$ from the GP prior and
 # evaluate them at all $\ma X$ = `X_pred` points, of which we have $N=200$. So
-# we effectively generate samples from $p(\predve f|\ma X) = \mathcal N(\ve
-# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ and the
-# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$.
+# we effectively generate samples from `pri_f` =  $p(\predve f|\ma X) = \mathcal N(\ve
+# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ represents a
+# sampled *function* $f$ evaluated at the $N=200$ points in $\ma X$. The
+# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. Its diagonal
+# $\diag\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis.
+# This is what we plot as "confidence band" `f_mean` $\pm$ `2 * f_std`.
+# The off-diagonal elements represent the correlation between different points
+# $K_{ij} = \cov[f(\ve x_i), f(\ve x_j)]$.
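+
+# As a small optional check (a sketch, not part of the original notebook
+# flow), we can build the prior covariance (kernel) matrix $\ma K$ explicitly
+# from the kernel defined above and confirm that its diagonal is constant and
+# equal to the outputscale $s$, i.e. the prior variance `f_std**2` at each
+# point. `.to_dense()` assumes a recent gpytorch / linear_operator version
+# (older versions use `.evaluate()` instead).
+
+with torch.no_grad():
+    K = model.covar_module(X_pred, X_pred).to_dense()  # (N, N) kernel matrix
+    print(f"{K.shape=}")
+    # diag(K) equals the outputscale s for the scaled RBF kernel used here
+    assert torch.allclose(K.diagonal(), model.covar_module.outputscale)
+    # off-diagonal entries K_ij = cov[f(x_i), f(x_j)], decaying with |x_i - x_j|
+    print(K[0, :5])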
 
 # +
 model.eval()
@@ -205,11 +235,12 @@ with torch.no_grad():
 # -
 
 
-# Let's investigate the samples more closely. A constant mean $\ve m(\ma X) =
-# \ve c$ does *not* mean that each sampled vector $\predve f$'s mean is
-# equal to $c$. Instead, we have that at each $\ve x_i$, the mean of
-# *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M f_j(\ve x_i)
-# \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$.
+# Let's investigate the samples more closely. First we note that the samples
+# fluctuate around the mean `model.mean_module.constant` we defined above. A
+# constant mean $\ve m(\ma X) = \ve c$ does *not* mean that each sampled vector
+# $\predve f$'s mean is equal to $c$. Instead, we have that at each $\ve x_i$,
+# the mean of *all* sampled functions is the same, so $\frac{1}{M}\sum_{m=1}^M
+# f_m(\ve x_i) \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$.
 #
 
 # Look at the first 20 x points from M=10 samples
@@ -228,12 +259,13 @@ print(f"{f_samples.mean(axis=0).std()=}")
 # # GP posterior predictive distribution with fixed hyper params
 #
 # Now we calculate the posterior predictive distribution $p(\test{\predve
-# f}|\test{\ma X}, \ma X, \ve y)$, i.e. we condition on the train data (Bayesian
+# f}|\test{\ma X}, \ma X, \ve y) = \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$,
+# i.e. we condition on the train data (Bayesian
 # inference).
 #
 # We use the fixed hyper param values defined above. In particular, since
 # $\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a
-# regression setting.
+# regression setting -- the GP's mean doesn't interpolate all points.
 
 # +
 # Evaluation (predictive posterior) mode
@@ -264,6 +296,14 @@ with torch.no_grad():
         color="tab:red",
         lw=2,
     )
+    ax.plot(
+        X_pred.numpy(),
+        y_gt_pred.numpy(),
+        label="ground truth",
+        color="k",
+        lw=2,
+        ls="--",
+    )
     ax.fill_between(
         X_pred.numpy(),
         lower.numpy(),
@@ -280,20 +320,23 @@ with torch.no_grad():
     ax.legend()
 # -
 
+# We observe that all sampled functions (green) and the mean (red) tend towards
+# the low fixed mean function $m(\ve x)=c$ at 3.0 in the absence of data, while
+# the actual data mean is `const` from above (data generation). Also, the other
+# hyper params ($\ell$, $\sigma_n^2$, $s$) are just guesses. Now we will
+# learn their values by minimizing the negative log marginal
+# likelihood.
 
 # # Fit GP to data: optimize hyper params
 #
 # In each step of the optimizer, we condition on the training data (e.g. do
 # Bayesian inference) to calculate the posterior predictive distribution for
-# the current values of the hyper params. We iterate until the log marginal
+# the current values of the hyper params. We iterate until the negative log marginal
 # likelihood is converged.
 #
 # We use a simplistic PyTorch-style hand written train loop without convergence
 # control, so make sure to use enough `n_iter` and eyeball-check that the loss
 # is converged :-)
-#
-# Observe how all hyper params converge. In particular, note that the constant
-# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`.
 
 # +
 # Train mode
@@ -311,36 +354,39 @@ for ii in range(n_iter):
     loss.backward()
     optimizer.step()
     if (ii + 1) % 10 == 0:
-        print(f"iter {ii+1}/{n_iter}, {loss=:.3f}")
-    for p_name, p_val in extract_model_params(model).items():
+        print(f"iter {ii + 1}/{n_iter}, {loss=:.3f}")
+    for p_name, p_val in extract_model_params(model, try_item=True).items():
         history[p_name].append(p_val)
     history["loss"].append(loss.item())
 # -
 
-# Plot hyper params and loss (neg. log marginal likelihood) convergence
+# Plot hyper params and loss (negative log marginal likelihood) convergence
 ncols = len(history)
-fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))
-for ax, (p_name, p_lst) in zip(axs, history.items()):
-    ax.plot(p_lst)
-    ax.set_title(p_name)
-    ax.set_xlabel("iterations")
+fig, axs = plt.subplots(
+    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed"
+)
+with torch.no_grad():
+    for ax, (p_name, p_lst) in zip(axs, history.items()):
+        ax.plot(p_lst)
+        ax.set_title(p_name)
+        ax.set_xlabel("iterations")
 
 # Values of optimized hyper params
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
+
+# We see that all hyper params converge. In particular, note that the constant
+# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`.
 
 # # Run prediction
 #
-# We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs.
-# "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma
-# I_N)}$) predictions with
-#
-# $$\ma\Sigma = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$$
+# We run prediction with two variants of the posterior predictive distribution:
+# using either only the epistemic uncertainty or using the total uncertainty.
 #
-# We find that $\ma\Sigma$ reflects behavior we would like to see from
-# epistemic uncertainty -- it is high when we have no data
-# (out-of-distribution). But this alone isn't the whole story. We need to add
-# the estimated noise level $\sigma_n^2$ in order for the confidence band to
-# cover the data.
+# * epistemic: $p(\test{\predve f}|\test{\ma X}, \ma X, \ve y) =
+#   \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$ = `post_pred_f` with
+#   $\test{\ma\Sigma} = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$
+# * total: $p(\test{\predve y}|\test{\ma X}, \ma X, \ve y) =
+#   \mathcal N(\test{\ve\mu}, \test{\ma\Sigma} + \sigma_n^2\,\ma I_N)$ = `post_pred_y`
 
 # +
 # Evaluation (predictive posterior) mode
@@ -352,10 +398,15 @@ with torch.no_grad():
     post_pred_f = model(X_pred)
     post_pred_y = likelihood(model(X_pred))
 
-    fig, axs = plt.subplots(ncols=2, figsize=(12, 5))
+    fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)
     fig_sigmas, ax_sigmas = plt.subplots()
-    for ii, (ax, post_pred, name) in enumerate(
-        zip(axs, [post_pred_f, post_pred_y], ["f", "y"])
+    for ii, (ax, post_pred, name, title) in enumerate(
+        zip(
+            axs,
+            [post_pred_f, post_pred_y],
+            ["f", "y"],
+            ["epistemic uncertainty", "total uncertainty"],
+        )
     ):
         yf_mean = post_pred.mean
         yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))
@@ -377,6 +428,14 @@ with torch.no_grad():
             color="tab:red",
             lw=2,
         )
+        ax.plot(
+            X_pred.numpy(),
+            y_gt_pred.numpy(),
+            label="ground truth",
+            color="k",
+            lw=2,
+            ls="--",
+        )
         ax.fill_between(
             X_pred.numpy(),
             lower.numpy(),
@@ -385,19 +444,20 @@ with torch.no_grad():
             color="tab:orange",
             alpha=0.3,
         )
+        ax.set_title(f"confidence = {title}")
         if name == "f":
-            sigma_label = r"$\pm 2\sqrt{\mathrm{diag}(\Sigma)}$"
+            sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$"
             zorder = 1
         else:
             sigma_label = (
-                r"$\pm 2\sqrt{\mathrm{diag}(\Sigma + \sigma_n^2\,I)}$"
+                r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$"
             )
             zorder = 0
         ax_sigmas.fill_between(
             X_pred.numpy(),
             lower.numpy(),
             upper.numpy(),
-            label="confidence " + sigma_label,
+            label=sigma_label,
             color="tab:orange" if name == "f" else "tab:blue",
             alpha=0.5,
             zorder=zorder,
@@ -409,11 +469,44 @@ with torch.no_grad():
         plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples")
         if ii == 1:
             ax.legend()
+    ax_sigmas.set_title("total vs. epistemic uncertainty")
     ax_sigmas.legend()
 # -
 
+# We find that $\test{\ma\Sigma}$ reflects behavior we would like to see from
+# epistemic uncertainty -- it is high when we have no data
+# (out-of-distribution). But this alone isn't the whole story. We need to add
+# the estimated likelihood variance $\sigma_n^2$ in order for the confidence
+# band to cover the data -- this turns the estimated total uncertainty into a
+# *calibrated* uncertainty.
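+
+# As a quick sanity check (a sketch, not part of the original notebook flow),
+# we can verify that the total predictive covariance of `post_pred_y` from the
+# cell above equals the epistemic one of `post_pred_f` plus the learned noise
+# variance $\sigma_n^2$ on the diagonal, up to numerical tolerance.
+
+with torch.no_grad():
+    sigma2_n = model.likelihood.noise_covar.noise
+    cov_f = post_pred_f.covariance_matrix
+    cov_y = post_pred_y.covariance_matrix
+    diff = cov_y - (cov_f + sigma2_n * torch.eye(len(X_pred)))
+    print("max |cov_y - (cov_f + sigma_n^2 I)|:", diff.abs().max().item())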
+
+# # Let's check the learned noise
+#
+# We compare the target data noise (`noise_std`) to the learned GP noise, in
+# the form of the likelihood *standard deviation* $\sigma_n$. The latter is
+# equal to the $\sqrt{\cdot}$ of `likelihood.noise_covar.noise` and can also be
+# calculated via $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N) -
+# \diag(\test{\ma\Sigma})}$.
+
 # +
+# Target noise to learn
+print("data noise:", noise_std)
+
+# The two below must be the same
+print(
+    "learned noise:",
+    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),
+)
+print(
+    "learned noise:",
+    np.sqrt(
+        extract_model_params(model, try_item=True)[
+            "likelihood.noise_covar.noise"
+        ]
+    ),
+)
+# -
+
 # When running as script
 if not is_interactive():
     plt.show()
-# -
diff --git a/BLcourse2.3/02_two_dim.ipynb b/BLcourse2.3/02_two_dim.ipynb
index a87daa957597ef0730b108251a6f5c459c7c8eec..9ae348f938d2f337bd74ce53ed79520695bb6fc7 100644
--- a/BLcourse2.3/02_two_dim.ipynb
+++ b/BLcourse2.3/02_two_dim.ipynb
@@ -1,5 +1,27 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8c19b309",
+   "metadata": {},
+   "source": [
+    "# About\n",
+    "\n",
+    "In this notebook, we use a GP to fit a 2D data set. We use the same ExactGP\n",
+    "machinery as in the 1D case and show how GPs can be used for 2D interpolation\n",
+    "(when data is free of noise) or regression (noisy data). Think of this as a\n",
+    "toy geospatial data setting. Actually, in geostatistics, Gaussian process\n",
+    "regression is known as [Kriging](https://en.wikipedia.org/wiki/Kriging).\n",
+    "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n",
+    "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n",
+    "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n",
+    "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n",
+    "$\\newcommand{\\test}[1]{#1_*}$\n",
+    "$\\newcommand{\\testtest}[1]{#1_{**}}$\n",
+    "$\\DeclareMathOperator{\\diag}{diag}$\n",
+    "$\\DeclareMathOperator{\\cov}{cov}$"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -7,8 +29,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%matplotlib notebook\n",
-    "##%matplotlib widget\n",
+    "##%matplotlib notebook\n",
+    "%matplotlib widget\n",
     "##%matplotlib inline"
    ]
   },
@@ -53,7 +75,14 @@
     "lines_to_next_cell": 2
    },
    "source": [
-    "# Generate toy 2D data"
+    "# Generate toy 2D data\n",
+    "\n",
+    "Our ground truth function is $f(\\ve x) = \\sin(r) / r$ with $\\ve x =\n",
+    "[x_0,x_1] \\in\\mathbb R^2$ and the radial distance\n",
+    "$r=\\sqrt{\\ve x^\\top\\,\\ve x}$, also known as \"Mexican hat\" function, which is a\n",
+    "radial wave-like pattern which decays with distance from the center $\\ve\n",
+    "x=\\ve 0$. We generate data by random sampling 2D points $\\ve x_i$ and calculating\n",
+    "$y_i = f(\\ve x_i)$, optionally adding Gaussian noise further down."
    ]
   },
   {
@@ -146,6 +175,14 @@
     "# Exercise 1"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c860aa4f",
+   "metadata": {},
+   "source": [
+    "Keep the settings below and explore the notebook till the end first."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -165,15 +202,23 @@
     "# Exercise 2"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9f0c3b31",
+   "metadata": {},
+   "source": [
+    "First complete the notebook as is, then come back here."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6bec0f58",
+   "id": "5132eaeb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "##use_noise = True\n",
-    "##use_gap = False"
+    "##use_noise = False\n",
+    "##use_gap = True"
    ]
   },
   {
@@ -184,21 +229,29 @@
     "# Exercise 3"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "bfd5d87b",
+   "metadata": {},
+   "source": [
+    "First complete the notebook with Exercise 2, then come back here."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5132eaeb",
+   "id": "f27dce89",
    "metadata": {},
    "outputs": [],
    "source": [
-    "##use_noise = False\n",
+    "##use_noise = True\n",
     "##use_gap = True"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6efe09b3",
+   "id": "cd8ac983",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -206,7 +259,9 @@
     "    # noisy train data\n",
     "    noise_std = 0.2\n",
     "    noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n",
-    "    y_train = data_train.z + noise_dist.sample_n(len(data_train.z))\n",
+    "    y_train = data_train.z + noise_dist.sample(\n",
+    "        sample_shape=(len(data_train.z),)\n",
+    "    )\n",
     "else:\n",
     "    # noise-free train data\n",
     "    noise_std = 0\n",
@@ -216,11 +271,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "af637911",
+   "id": "59f6f6d6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Cut out part of the train data to create out-of-distribution predictions\n",
+    "# Cut out part of the train data to create out-of-distribution predictions.\n",
+    "# Same as the \"gaps\" we created in the 1D case.\n",
     "\n",
     "if use_gap:\n",
     "    mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)\n",
@@ -254,6 +310,15 @@
     "ax.set_ylabel(\"X_1\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "b7ff9e6e",
+   "metadata": {},
+   "source": [
+    "The gray surface is the ground truth function. The blue points are the\n",
+    "training data."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "a00bf4e4",
@@ -318,7 +383,7 @@
    "outputs": [],
    "source": [
     "# Default start hyper params\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
@@ -334,7 +399,7 @@
     "model.covar_module.outputscale = 8.0\n",
     "model.likelihood.noise_covar.noise = 0.1\n",
     "\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
@@ -356,10 +421,10 @@
     "model.train()\n",
     "likelihood.train()\n",
     "\n",
-    "optimizer = torch.optim.Adam(model.parameters(), lr=0.2)\n",
+    "optimizer = torch.optim.Adam(model.parameters(), lr=0.15)\n",
     "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n",
     "\n",
-    "n_iter = 300\n",
+    "n_iter = 400\n",
     "history = defaultdict(list)\n",
     "for ii in range(n_iter):\n",
     "    optimizer.zero_grad()\n",
@@ -367,8 +432,8 @@
     "    loss.backward()\n",
     "    optimizer.step()\n",
     "    if (ii + 1) % 10 == 0:\n",
-    "        print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n",
-    "    for p_name, p_val in extract_model_params(model).items():\n",
+    "        print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n",
+    "    for p_name, p_val in extract_model_params(model, try_item=True).items():\n",
     "        history[p_name].append(p_val)\n",
     "    history[\"loss\"].append(loss.item())"
    ]
@@ -381,11 +446,14 @@
    "outputs": [],
    "source": [
     "ncols = len(history)\n",
-    "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n",
-    "for ax, (p_name, p_lst) in zip(axs, history.items()):\n",
-    "    ax.plot(p_lst)\n",
-    "    ax.set_title(p_name)\n",
-    "    ax.set_xlabel(\"iterations\")"
+    "fig, axs = plt.subplots(\n",
+    "    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n",
+    ")\n",
+    "with torch.no_grad():\n",
+    "    for ax, (p_name, p_lst) in zip(axs, history.items()):\n",
+    "        ax.plot(p_lst)\n",
+    "        ax.set_title(p_name)\n",
+    "        ax.set_xlabel(\"iterations\")"
    ]
   },
   {
@@ -396,7 +464,7 @@
    "outputs": [],
    "source": [
     "# Values of optimized hyper params\n",
-    "pprint(extract_model_params(model, raw=False))"
+    "pprint(extract_model_params(model))"
    ]
   },
   {
@@ -411,9 +479,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "59e18623",
-   "metadata": {
-    "lines_to_next_cell": 2
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "model.eval()\n",
@@ -445,12 +511,58 @@
     "assert (post_pred_f.mean == post_pred_y.mean).all()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "bff55240",
+   "metadata": {},
+   "source": [
+    "When `use_noise=False`, then the GP's prediction is an almost perfect\n",
+    "reconstruction of the ground truth function (in-distribution, so where we\n",
+    "have data).\n",
+    "In this case, the plot makes the GP prediction look like a perfect\n",
+    "*interpolation* of the noise-free data, so $\\test{\\ve\\mu} = \\ve y$ at the\n",
+    "train points $\\test{\\ma X} = \\ma X$. This\n",
+    "would be true if our GP model had exactly zero noise, so the likelihood's\n",
+    "$\\sigma_n^2$ would be zero. However `print(model`)\n",
+    "\n",
+    "```\n",
+    "ExactGPModel(\n",
+    " (likelihood): GaussianLikelihood(\n",
+    "   (noise_covar): HomoskedasticNoise(\n",
+    "     (raw_noise_constraint): GreaterThan(1.000E-04)\n",
+    "   )\n",
+    " )\n",
+    " ...\n",
+    " ```\n",
+    "\n",
+    "shows that actually the min value is $10^{-4}$, so we technically always have\n",
+    "a regression setting, just with very small noise. The reason is that in the\n",
+    "GP equations, we have\n",
+    "\n",
+    "$$\\test{\\ve\\mu} = \\test{\\ma K}\\,\\left(\\ma K+\\sigma_n^2\\,\\ma I_N\\right)^{-1}\\,\\ve y$$\n",
+    "\n",
+    "where $\\sigma_n^2$ acts as a *regularization* parameter (also called \"jitter\n",
+    "term\" sometimes), which improves the\n",
+    "numerical stability of the linear system solve step\n",
+    "\n",
+    "$$\\left(\\ma K+\\sigma_n^2\\,\\ma I_N\\right)^{-1}\\,\\ve y\\:.$$\n",
+    "\n",
+    "Also we always keep $\\sigma_n^2$ as hyper parameter that we learn, and the\n",
+    "smallest value the hyper parameter optimization can reach is $10^{-4}$.\n",
+    "\n",
+    "While 3D plots are fun, they are not optimal for judging how well\n",
+    "the GP model represents the ground truth function."
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "591e453d",
    "metadata": {},
    "source": [
-    "# Plot difference to ground truth and uncertainty"
+    "# Plot difference to ground truth and uncertainty\n",
+    "\n",
+    "Let's use contour plots to visualize the difference between GP prediction and\n",
+    "ground truth, as well as epistemic, total and aleatoric uncertainty."
    ]
   },
   {
@@ -460,8 +572,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ncols = 3\n",
-    "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))\n",
+    "ncols = 4\n",
+    "fig, axs = plt.subplots(\n",
+    "    ncols=ncols, nrows=1, figsize=(ncols * 5, 4), layout=\"compressed\"\n",
+    ")\n",
     "\n",
     "vmax = post_pred_y.stddev.max()\n",
     "cs = []\n",
@@ -477,27 +591,42 @@
     ")\n",
     "axs[0].set_title(\"|y_pred - y_true|\")\n",
     "\n",
+    "f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny))\n",
+    "y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))\n",
+    "\n",
     "cs.append(\n",
     "    axs[1].contourf(\n",
     "        data_pred.XG,\n",
     "        data_pred.YG,\n",
-    "        post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),\n",
+    "        f_std,\n",
     "        vmin=0,\n",
     "        vmax=vmax,\n",
     "    )\n",
     ")\n",
-    "axs[1].set_title(\"f_std (epistemic)\")\n",
+    "axs[1].set_title(\"epistemic: f_std\")\n",
     "\n",
     "cs.append(\n",
     "    axs[2].contourf(\n",
     "        data_pred.XG,\n",
     "        data_pred.YG,\n",
-    "        post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),\n",
+    "        y_std,\n",
     "        vmin=0,\n",
     "        vmax=vmax,\n",
     "    )\n",
     ")\n",
-    "axs[2].set_title(\"y_std (epistemic + aleatoric)\")\n",
+    "axs[2].set_title(\"total: y_std\")\n",
+    "\n",
+    "cs.append(\n",
+    "    axs[3].contourf(\n",
+    "        data_pred.XG,\n",
+    "        data_pred.YG,\n",
+    "        y_std - f_std,\n",
+    "        vmin=0,\n",
+    "        cmap=\"plasma\",\n",
+    "        ##vmax=vmax,\n",
+    "    )\n",
+    ")\n",
+    "axs[3].set_title(\"aleatoric: y_std - f_std\")\n",
     "\n",
     "for ax, c in zip(axs, cs):\n",
     "    ax.set_xlabel(\"X_0\")\n",
@@ -511,7 +640,7 @@
    "id": "04257cea",
    "metadata": {},
    "source": [
-    "# Check learned noise"
+    "## Let's check the learned noise"
    ]
   },
   {
@@ -521,13 +650,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt())\n",
+    "# Target noise to learn\n",
+    "print(\"data noise:\", noise_std)\n",
+    "\n",
+    "# The two below must be the same\n",
     "print(\n",
-    "    np.sqrt(\n",
-    "        extract_model_params(model, raw=False)[\"likelihood.noise_covar.noise\"]\n",
-    "    )\n",
+    "    \"learned noise:\",\n",
+    "    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n",
     ")\n",
-    "print(noise_std)"
+    "print(\n",
+    "    \"learned noise:\",\n",
+    "    np.sqrt(\n",
+    "        extract_model_params(model, try_item=True)[\n",
+    "            \"likelihood.noise_covar.noise\"\n",
+    "        ]\n",
+    "    ),\n",
+    ")"
    ]
   },
   {
@@ -535,7 +673,68 @@
    "id": "1da209ff",
    "metadata": {},
    "source": [
-    "# Plot confidence bands"
+    "# Observations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c52e65c1",
+   "metadata": {},
+   "source": [
+    "We have the following terms:\n",
+    "\n",
+    "* epistemic: `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$\n",
+    "* total: `y_std` = $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N)}$\n",
+    "* aleatoric: we have two ways of representing it\n",
+    "  * from the likelihood: $\\sigma_n$\n",
+    "  * for plotting: we use `y_std` - `f_std`, this is $\\neq \\sigma_n$ because of the $\\sqrt{\\cdot}$\n",
+    "    above\n",
+    "\n",
+    "We can make the following observations:\n",
+    "\n",
+    "* Exercise 1: `use_noise=False`, `use_gap=False`\n",
+    "  * The epistemic uncertainty `f_std` is a good indicator\n",
+    "    of the (small) differences between model prediction and ground truth\n",
+    "  * The learned variance $\\sigma_n^2$, and hence the aleatoric uncertainty is\n",
+    "    near zero, which makes sense for noise-free data\n",
+    "* Exercise 2: `use_noise=False`, `use_gap=True`\n",
+    "  * When faced with out-of-distribution (OOD) data, the epistemic `f_std`\n",
+    "    clearly shows where the model will make wrong (less trustworthy)\n",
+    "    predictions\n",
+    "* Exercise 3: `use_noise=True`, `use_gap=True`\n",
+    "  * in-distribution (where we have data)\n",
+    "    * The distinction between\n",
+    "      epistemic and aleatoric uncertainty in the way we define it is less meaningful,\n",
+    "      hence, `f_std` doesn't correlate well with `y_pred - y_true`. The\n",
+    "      reason is that the noise $\\sigma_n$ shows up in two parts: (a) in the\n",
+    "      equation of $\\test{\\ma\\Sigma}$ itself, so the \"epistemic\" uncertainty\n",
+    "      `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$ is bigger just because we have\n",
+    "      noise (regression) and (b) we add it in $\\sqrt{\\diag(\\test{\\ma\\Sigma} +\n",
+    "      \\sigma_n^2\\,\\ma I_N)}$ to get the total `y_std`\n",
+    "    * The `y_std` plot looks like the `f_std` one, but shifted by a constant.\n",
+    "      But this is not the case because we compare standard deviations and not\n",
+    "      variances, hence `y_std` - `f_std` is not constant, and in particular\n",
+    "      $\\neq \\sigma_n$, but both are in the same numerical range (0.15 vs. 0.2).\n",
+    "  * out-of-distribution: `f_std` (epistemic) dominates"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08e949a2",
+   "metadata": {},
+   "source": [
+    "# Exercises\n",
+    "\n",
+    "Go back up, switch on the settings for Exercise 2 and re-run the notebook.\n",
+    "Same with Exercise 3."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee89f26a",
+   "metadata": {},
+   "source": [
+    "# Bonus: plot confidence bands"
    ]
   },
   {
@@ -587,6 +786,18 @@
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
   }
  },
  "nbformat": 4,
diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py
index b7ee2273a279ef69e86f6d47fb1ac71d4fb13f97..6c59344978518666d3d3b2bcd11fcea1b4a5b0d3 100644
--- a/BLcourse2.3/02_two_dim.py
+++ b/BLcourse2.3/02_two_dim.py
@@ -6,18 +6,32 @@
 #       extension: .py
 #       format_name: light
 #       format_version: '1.5'
-#       jupytext_version: 1.16.2
+#       jupytext_version: 1.17.1
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
 #     name: python3
 # ---
 
-# +
-# %matplotlib notebook
-# ##%matplotlib widget
+# # About
+#
+# In this notebook, we use a GP to fit a 2D data set. We use the same ExactGP
+# machinery as in the 1D case and show how GPs can be used for 2D interpolation
+# (when data is free of noise) or regression (noisy data). Think of this as a
+# toy geospatial data setting. Actually, in geostatistics, Gaussian process
+# regression is known as [Kriging](https://en.wikipedia.org/wiki/Kriging).
+# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
+# $\newcommand{\ma}[1]{\mathbf{#1}}$
+# $\newcommand{\pred}[1]{\rm{#1}}$
+# $\newcommand{\predve}[1]{\mathbf{#1}}$
+# $\newcommand{\test}[1]{#1_*}$
+# $\newcommand{\testtest}[1]{#1_{**}}$
+# $\DeclareMathOperator{\diag}{diag}$
+# $\DeclareMathOperator{\cov}{cov}$
+
+# ##%matplotlib notebook
+# %matplotlib widget
 # ##%matplotlib inline
-# -
 
 # +
 from collections import defaultdict
@@ -39,6 +53,13 @@ torch.set_default_dtype(torch.float64)
 torch.manual_seed(123)
 
 # # Generate toy 2D data
+#
+# Our ground truth function is $f(\ve x) = \sin(r) / r$ with $\ve x =
+# [x_0,x_1] \in\mathbb R^2$ and the radial distance
+# $r=\sqrt{\ve x^\top\,\ve x}$, also known as the "Mexican hat" function: a
+# radial wave-like pattern that decays with distance from the center $\ve
+# x=\ve 0$. We generate data by randomly sampling 2D points $\ve x_i$ and calculating
+# $y_i = f(\ve x_i)$, optionally adding Gaussian noise further down.
 
 
 class MexicanHat:
@@ -112,39 +133,44 @@ X_pred = data_pred.X
 
 # # Exercise 1
 
-# +
+# Keep the settings below and explore the notebook till the end first.
+
 use_noise = False
 use_gap = False
-# -
 
 # # Exercise 2
 
+# First complete the notebook as is, then come back here.
+
 # +
-##use_noise = True
-##use_gap = False
+##use_noise = False
+##use_gap = True
 # -
 
 # # Exercise 3
 
+# First complete the notebook with Exercise 2, then come back here.
+
 # +
-##use_noise = False
+##use_noise = True
 ##use_gap = True
 # -
 
-# +
 if use_noise:
     # noisy train data
     noise_std = 0.2
     noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)
-    y_train = data_train.z + noise_dist.sample_n(len(data_train.z))
+    y_train = data_train.z + noise_dist.sample(
+        sample_shape=(len(data_train.z),)
+    )
 else:
     # noise-free train data
     noise_std = 0
     y_train = data_train.z
-# -
 
 # +
-# Cut out part of the train data to create out-of-distribution predictions
+# Cut out part of the train data to create out-of-distribution predictions.
+# Same as the "gaps" we created in the 1D case.
 
 if use_gap:
     mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)
@@ -170,6 +196,9 @@ s1 = ax.scatter(
 ax.set_xlabel("X_0")
 ax.set_ylabel("X_1")
 
+# The gray surface is the ground truth function. The blue points are the
+# training data.
+
 # # Define GP model
 
 
@@ -206,7 +235,7 @@ model = ExactGPModel(X_train, y_train, likelihood)
 print(model)
 
 # Default start hyper params
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
 
 # +
 # Set new start hyper params
@@ -215,7 +244,7 @@ model.covar_module.base_kernel.lengthscale = 3.0
 model.covar_module.outputscale = 8.0
 model.likelihood.noise_covar.noise = 0.1
 
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
 # -
 
 # # Fit GP to data: optimize hyper params
@@ -225,10 +254,10 @@ pprint(extract_model_params(model, raw=False))
 model.train()
 likelihood.train()
 
-optimizer = torch.optim.Adam(model.parameters(), lr=0.2)
+optimizer = torch.optim.Adam(model.parameters(), lr=0.15)
 loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
 
-n_iter = 300
+n_iter = 400
 history = defaultdict(list)
 for ii in range(n_iter):
     optimizer.zero_grad()
@@ -236,21 +265,24 @@ for ii in range(n_iter):
     loss.backward()
     optimizer.step()
     if (ii + 1) % 10 == 0:
-        print(f"iter {ii+1}/{n_iter}, {loss=:.3f}")
-    for p_name, p_val in extract_model_params(model).items():
+        print(f"iter {ii + 1}/{n_iter}, {loss=:.3f}")
+    for p_name, p_val in extract_model_params(model, try_item=True).items():
         history[p_name].append(p_val)
     history["loss"].append(loss.item())
 # -
 
 ncols = len(history)
-fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))
-for ax, (p_name, p_lst) in zip(axs, history.items()):
-    ax.plot(p_lst)
-    ax.set_title(p_name)
-    ax.set_xlabel("iterations")
+fig, axs = plt.subplots(
+    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed"
+)
+with torch.no_grad():
+    for ax, (p_name, p_lst) in zip(axs, history.items()):
+        ax.plot(p_lst)
+        ax.set_title(p_name)
+        ax.set_xlabel("iterations")
 
 # Values of optimized hyper params
-pprint(extract_model_params(model, raw=False))
+pprint(extract_model_params(model))
 
 # # Run prediction
 
@@ -284,12 +316,53 @@ with torch.no_grad():
 assert (post_pred_f.mean == post_pred_y.mean).all()
 # -
 
+# When `use_noise=False`, the GP's prediction is an almost perfect
+# reconstruction of the ground truth function (in-distribution, so where we
+# have data).
+# In this case, the plot makes the GP prediction look like a perfect
+# *interpolation* of the noise-free data, so $\test{\ve\mu} = \ve y$ at the
+# train points $\test{\ma X} = \ma X$. This
+# would be true if our GP model had exactly zero noise, so the likelihood's
+# $\sigma_n^2$ would be zero. However, `print(model)`
+#
+# ```
+# ExactGPModel(
+#  (likelihood): GaussianLikelihood(
+#    (noise_covar): HomoskedasticNoise(
+#      (raw_noise_constraint): GreaterThan(1.000E-04)
+#    )
+#  )
+#  ...
+#  ```
+#
+# shows that the minimal allowed noise value is actually $10^{-4}$, so we technically always have
+# a regression setting, just with very small noise. The reason is that in the
+# GP equations, we have
+#
+# $$\test{\ve\mu} = \test{\ma K}\,\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y$$
+#
+# where $\sigma_n^2$ acts as a *regularization* parameter (also called "jitter
+# term" sometimes), which improves the
+# numerical stability of the linear system solve step
+#
+# $$\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y\:.$$
+#
+# Also, we always keep $\sigma_n^2$ as a hyper parameter that we learn, and the
+# smallest value the hyper parameter optimization can reach is $10^{-4}$.
+#
+# While 3D plots are fun, they are not optimal for judging how well
+# the GP model represents the ground truth function.
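+
+# A small illustration of the regularization effect (a sketch, not part of the
+# original notebook flow): adding $\sigma_n^2$ on the diagonal improves the
+# conditioning of the matrix that gets inverted above. `.to_dense()` assumes a
+# recent gpytorch / linear_operator version (older versions use `.evaluate()`).
+
+with torch.no_grad():
+    K = model.covar_module(X_train, X_train).to_dense()
+    for sigma2_n in (0.0, 1e-4, 1e-2):
+        A = K + sigma2_n * torch.eye(len(X_train))
+        print(
+            f"sigma_n^2 = {sigma2_n:.0e}: "
+            f"cond(K + sigma_n^2 I) = {torch.linalg.cond(A).item():.3e}"
+        )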
 
 # # Plot difference to ground truth and uncertainty
+#
+# Let's use contour plots to visualize the difference between GP prediction and
+# ground truth, as well as epistemic, total and aleatoric uncertainty.
 
 # +
-ncols = 3
-fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))
+ncols = 4
+fig, axs = plt.subplots(
+    ncols=ncols, nrows=1, figsize=(ncols * 5, 4), layout="compressed"
+)
 
 vmax = post_pred_y.stddev.max()
 cs = []
@@ -305,27 +378,42 @@ cs.append(
 )
 axs[0].set_title("|y_pred - y_true|")
 
+f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny))
+y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))
+
 cs.append(
     axs[1].contourf(
         data_pred.XG,
         data_pred.YG,
-        post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),
+        f_std,
         vmin=0,
         vmax=vmax,
     )
 )
-axs[1].set_title("f_std (epistemic)")
+axs[1].set_title("epistemic: f_std")
 
 cs.append(
     axs[2].contourf(
         data_pred.XG,
         data_pred.YG,
-        post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),
+        y_std,
         vmin=0,
         vmax=vmax,
     )
 )
-axs[2].set_title("y_std (epistemic + aleatoric)")
+axs[2].set_title("total: y_std")
+
+cs.append(
+    axs[3].contourf(
+        data_pred.XG,
+        data_pred.YG,
+        y_std - f_std,
+        vmin=0,
+        cmap="plasma",
+        ##vmax=vmax,
+    )
+)
+axs[3].set_title("aleatoric: y_std - f_std")
 
 for ax, c in zip(axs, cs):
     ax.set_xlabel("X_0")
@@ -334,19 +422,71 @@ for ax, c in zip(axs, cs):
     fig.colorbar(c, ax=ax)
 # -
 
-# # Check learned noise
+# ## Let's check the learned noise
 
 # +
-print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt())
+# Target noise to learn
+print("data noise:", noise_std)
+
+# The two below must be the same
+print(
+    "learned noise:",
+    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),
+)
 print(
+    "learned noise:",
     np.sqrt(
-        extract_model_params(model, raw=False)["likelihood.noise_covar.noise"]
-    )
+        extract_model_params(model, try_item=True)[
+            "likelihood.noise_covar.noise"
+        ]
+    ),
 )
-print(noise_std)
 # -
 
-# # Plot confidence bands
+# # Observations
+
+# We have the following terms:
+#
+# * epistemic: `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$
+# * total: `y_std` = $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N)}$
+# * aleatoric: we have two ways of representing it
+#   * from the likelihood: $\sigma_n$
+#   * for plotting: we use `y_std` - `f_std`, which is $\neq \sigma_n$ because
+#     of the $\sqrt{\cdot}$ above (see the small numeric check below)
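+#
+# A quick numeric check of the last point (a standalone sketch with made-up
+# numbers, not values taken from this notebook):
+#
+# ```
+# import math
+#
+# f_var = 0.05**2    # assumed epistemic variance, one entry of diag(Sigma_*)
+# sigma_n = 0.1      # assumed learned noise standard deviation
+#
+# f_std = math.sqrt(f_var)
+# y_std = math.sqrt(f_var + sigma_n**2)
+# print(y_std - f_std)   # ~0.062, clearly not equal to sigma_n = 0.1
+# ```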
+#
+# We can make the following observations:
+#
+# * Exercise 1: `use_noise=False`, `use_gap=False`
+#   * The epistemic uncertainty `f_std` is a good indicator
+#     of the (small) differences between model prediction and ground truth
+#   * The learned variance $\sigma_n^2$, and hence the aleatoric uncertainty,
+#     is near zero, which makes sense for noise-free data
+# * Exercise 2: `use_noise=False`, `use_gap=True`
+#   * When faced with out-of-distribution (OOD) data, the epistemic `f_std`
+#     clearly shows where the model will make wrong (less trustworthy)
+#     predictions
+# * Exercise 3: `use_noise=True`, `use_gap=True`
+#   * in-distribution (where we have data)
+#     * The distinction between epistemic and aleatoric uncertainty, in the way
+#       we define it here, is less meaningful, and hence `f_std` doesn't
+#       correlate well with `y_pred - y_true`. The reason is that the noise
+#       $\sigma_n$ shows up in two places: (a) in the equation for
+#       $\test{\ma\Sigma}$ itself, so the "epistemic" uncertainty
+#       `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ is bigger just because we have
+#       noise (regression setting), and (b) we add it in
+#       $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N)}$ to get the total
+#       `y_std`
+#     * The `y_std` plot looks like the `f_std` one shifted by a constant, but
+#       this is not actually the case: since we compare standard deviations
+#       rather than variances, `y_std` - `f_std` is not constant and in
+#       particular $\neq \sigma_n$; both are, however, in the same numerical
+#       range (0.15 vs. 0.2).
+#   * out-of-distribution: `f_std` (epistemic) dominates
+
+# # Exercises
+#
+# Go back up, switch on the settings for Exercise 2 and re-run the notebook.
+# Same with Exercise 3.
+
+# # Bonus: plot confidence bands
 
 # +
 y_mean = post_pred_y.mean.reshape((data_pred.nx, data_pred.ny))
@@ -370,8 +510,6 @@ ax.set_zlim((contour_z, zlim[1] + abs(contour_z)))
 ax.contourf(data_pred.XG, data_pred.YG, y_std, zdir="z", offset=contour_z)
 # -
 
-# +
 # When running as script
 if not is_interactive():
     plt.show()
-# -
diff --git a/BLcourse2.3/03_one_dim_SVI.ipynb b/BLcourse2.3/03_one_dim_SVI.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..0520fa0366033d88482c76a09f7bf646e7ba4eed
--- /dev/null
+++ b/BLcourse2.3/03_one_dim_SVI.ipynb
@@ -0,0 +1,536 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b146bc52",
+   "metadata": {},
+   "source": [
+    "# About\n",
+    "\n",
+    "In this notebook, we replace the ExactGP inference and log marginal\n",
+    "likelihood optimization by using sparse stochastic variational inference.\n",
+    "This serves as an example of the many methods `gpytorch` offers to make GPs\n",
+    "scale to large data sets.\n",
+    "$\\newcommand{\\ve}[1]{\\mathit{\\boldsymbol{#1}}}$\n",
+    "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n",
+    "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n",
+    "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n",
+    "$\\newcommand{\\test}[1]{#1_*}$\n",
+    "$\\newcommand{\\testtest}[1]{#1_{**}}$\n",
+    "$\\DeclareMathOperator{\\diag}{diag}$\n",
+    "$\\DeclareMathOperator{\\cov}{cov}$"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f96e3304",
+   "metadata": {},
+   "source": [
+    "# Imports, helpers, setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "193b2f13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "##%matplotlib notebook\n",
+    "%matplotlib widget\n",
+    "##%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c11c1030",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import math\n",
+    "from collections import defaultdict\n",
+    "from pprint import pprint\n",
+    "\n",
+    "import torch\n",
+    "import gpytorch\n",
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib import is_interactive\n",
+    "import numpy as np\n",
+    "from torch.utils.data import TensorDataset, DataLoader\n",
+    "\n",
+    "from utils import extract_model_params, plot_samples\n",
+    "\n",
+    "\n",
+    "torch.set_default_dtype(torch.float64)\n",
+    "torch.manual_seed(123)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "460d4c3d",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# Generate toy 1D data\n",
+    "\n",
+    "Now we generate 10x more points as in the ExactGP case, still the inference\n",
+    "won't be much slower (exact GPs scale roughly as $N^3$). Note that the data we\n",
+    "use here is still tiny (1000 points is easy even for exact GPs), so the\n",
+    "method's usefulness cannot be fully exploited in our example\n",
+    "-- also we don't even use a GPU yet :)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1a9647d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ground_truth(x, const):\n",
+    "    return torch.sin(x) * torch.exp(-0.2 * x) + const\n",
+    "\n",
+    "\n",
+    "def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):\n",
+    "    noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n",
+    "    y = ground_truth(x, const=const) + noise_dist.sample(\n",
+    "        sample_shape=(len(x),)\n",
+    "    )\n",
+    "    msk = torch.tensor([True] * len(x))\n",
+    "    if gaps is not None:\n",
+    "        for g in gaps:\n",
+    "            msk = msk & ~((x > g[0]) & (x < g[1]))\n",
+    "    return x[msk], y[msk], y\n",
+    "\n",
+    "\n",
+    "const = 5.0\n",
+    "noise_std = 0.1\n",
+    "x = torch.linspace(0, 4 * math.pi, 1000)\n",
+    "X_train, y_train, y_gt_train = generate_data(\n",
+    "    x, gaps=[[6, 10]], const=const, noise_std=noise_std\n",
+    ")\n",
+    "X_pred = torch.linspace(\n",
+    "    X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n",
+    ")\n",
+    "y_gt_pred = ground_truth(X_pred, const=const)\n",
+    "\n",
+    "print(f\"{X_train.shape=}\")\n",
+    "print(f\"{y_train.shape=}\")\n",
+    "print(f\"{X_pred.shape=}\")\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "ax.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\", label=\"noisy data\")\n",
+    "ax.plot(X_pred, y_gt_pred, ls=\"--\", color=\"k\", label=\"ground truth\")\n",
+    "ax.legend()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ad60c5e",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "source": [
+    "# Define GP model\n",
+    "\n",
+    "The model follows [this\n",
+    "example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html)\n",
+    "based on [Hensman et al., \"Scalable Variational Gaussian Process Classification\",\n",
+    "2015](https://proceedings.mlr.press/v38/hensman15.html). The model is\n",
+    "\"sparse\" since it works with a set of *inducing* points $(\\ma Z, \\ve u),\n",
+    "\\ve u=f(\\ma Z)$ which is much smaller than the train data $(\\ma X, \\ve y)$.\n",
+    "\n",
+    "We have the same hyper parameters as before\n",
+    "\n",
+    "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n",
+    "* $\\sigma_n^2$ = `likelihood.noise_covar.noise`\n",
+    "* $s$ = `model.covar_module.outputscale`\n",
+    "* $m(\\ve x) = c$ = `model.mean_module.constant`\n",
+    "\n",
+    "plus additional ones, introduced by the approximations used:\n",
+    "\n",
+    "* the learnable inducing points $\\ma Z$ for the variational distribution\n",
+    "  $q_{\\ve\\psi}(\\ve u)$\n",
+    "* learnable parameters of the variational distribution $q_{\\ve\\psi}(\\ve u)=\\mathcal N(\\ve m_u, \\ma S)$: the\n",
+    "  variational mean $\\ve m_u$ and covariance $\\ma S$ in form a lower triangular\n",
+    "  matrix $\\ma L$ such that $\\ma S=\\ma L\\,\\ma L^\\top$"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "444929a3",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "class ApproxGPModel(gpytorch.models.ApproximateGP):\n",
+    "    def __init__(self, Z):\n",
+    "        # Approximate inducing value posterior q(u), u = f(Z), Z = inducing\n",
+    "        # points (subset of X_train)\n",
+    "        variational_distribution = (\n",
+    "            gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))\n",
+    "        )\n",
+    "        # Compute q(f(X)) from q(u)\n",
+    "        variational_strategy = gpytorch.variational.VariationalStrategy(\n",
+    "            self,\n",
+    "            Z,\n",
+    "            variational_distribution,\n",
+    "            learn_inducing_locations=True,\n",
+    "        )\n",
+    "        super().__init__(variational_strategy)\n",
+    "        self.mean_module = gpytorch.means.ConstantMean()\n",
+    "        self.covar_module = gpytorch.kernels.ScaleKernel(\n",
+    "            gpytorch.kernels.RBFKernel()\n",
+    "        )\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        mean_x = self.mean_module(x)\n",
+    "        covar_x = self.covar_module(x)\n",
+    "        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)\n",
+    "\n",
+    "\n",
+    "likelihood = gpytorch.likelihoods.GaussianLikelihood()\n",
+    "\n",
+    "n_train = len(X_train)\n",
+    "# Start values for inducing points Z, use 10% random sub-sample of X_train.\n",
+    "ind_points_fraction = 0.1\n",
+    "ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)]\n",
+    "model = ApproxGPModel(Z=X_train[ind_idxs])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5861163d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inspect the model\n",
+    "print(model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98032457",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inspect the likelihood. In contrast to ExactGP, the likelihood is not part of\n",
+    "# the GP model instance.\n",
+    "print(likelihood)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ec97687",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Default start hyper params\n",
+    "print(\"model params:\")\n",
+    "pprint(extract_model_params(model))\n",
+    "print(\"likelihood params:\")\n",
+    "pprint(extract_model_params(likelihood))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "167d584c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set new start hyper params (scalars only)\n",
+    "model.mean_module.constant = 3.0\n",
+    "model.covar_module.base_kernel.lengthscale = 1.0\n",
+    "model.covar_module.outputscale = 1.0\n",
+    "likelihood.noise_covar.noise = 0.3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6704ce2f",
+   "metadata": {},
+   "source": [
+    "# Fit GP to data: optimize hyper params\n",
+    "\n",
+    "Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),\n",
+    "where we optimize not the log marginal likelihood (ExactGP case),\n",
+    "but an ELBO (evidence lower bound) objective\n",
+    "\n",
+    "$$\n",
+    "\\max_\\ve\\zeta\\left(\\mathbb E_{q_{\\ve\\psi}(\\ve u)}\\left[\\ln p(\\ve y|\\ve u) \\right] -\n",
+    "D_{\\text{KL}}(q_{\\ve\\psi}(\\ve u)\\Vert p(\\ve u))\\right)\n",
+    "$$\n",
+    "\n",
+    "with respect to\n",
+    "\n",
+    "$$\\ve\\zeta = [\\ell, \\sigma_n^2, s, c, \\ve\\psi] $$\n",
+    "\n",
+    "with\n",
+    "\n",
+    "$$\\ve\\psi = [\\ve m_u, \\ma Z, \\ma L]$$\n",
+    "\n",
+    "the parameters of the variational distribution $q_{\\ve\\psi}(\\ve u)$.\n",
+    "\n",
+    "In addition, we perform a stochastic\n",
+    "optimization by using a deep learning type mini-batch loop, hence\n",
+    "\"stochastic\" variational inference (SVI). The latter speeds up the\n",
+    "optimization since we only look at a fraction of data per optimizer step to\n",
+    "calculate an approximate loss gradient (`loss.backward()`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f39d86f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Train mode\n",
+    "model.train()\n",
+    "likelihood.train()\n",
+    "\n",
+    "optimizer = torch.optim.Adam(\n",
+    "    [dict(params=model.parameters()), dict(params=likelihood.parameters())],\n",
+    "    lr=0.1,\n",
+    ")\n",
+    "loss_func = gpytorch.mlls.VariationalELBO(\n",
+    "    likelihood, model, num_data=X_train.shape[0]\n",
+    ")\n",
+    "\n",
+    "train_dl = DataLoader(\n",
+    "    TensorDataset(X_train, y_train), batch_size=128, shuffle=True\n",
+    ")\n",
+    "\n",
+    "n_iter = 200\n",
+    "history = defaultdict(list)\n",
+    "for i_iter in range(n_iter):\n",
+    "    for i_batch, (X_batch, y_batch) in enumerate(train_dl):\n",
+    "        batch_history = defaultdict(list)\n",
+    "        optimizer.zero_grad()\n",
+    "        loss = -loss_func(model(X_batch), y_batch)\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "        param_dct = dict()\n",
+    "        param_dct.update(extract_model_params(model, try_item=True))\n",
+    "        param_dct.update(extract_model_params(likelihood, try_item=True))\n",
+    "        for p_name, p_val in param_dct.items():\n",
+    "            if isinstance(p_val, float):\n",
+    "                batch_history[p_name].append(p_val)\n",
+    "        batch_history[\"loss\"].append(loss.item())\n",
+    "    for p_name, p_lst in batch_history.items():\n",
+    "        history[p_name].append(np.mean(p_lst))\n",
+    "    if (i_iter + 1) % 10 == 0:\n",
+    "        print(f\"iter {i_iter + 1}/{n_iter}, {loss=:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc39b5d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plot scalar hyper params and loss (ELBO) convergence\n",
+    "ncols = len(history)\n",
+    "fig, axs = plt.subplots(\n",
+    "    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout=\"compressed\"\n",
+    ")\n",
+    "with torch.no_grad():\n",
+    "    for ax, (p_name, p_lst) in zip(axs, history.items()):\n",
+    "        ax.plot(p_lst)\n",
+    "        ax.set_title(p_name)\n",
+    "        ax.set_xlabel(\"iterations\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4907e39",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "# Values of optimized hyper params\n",
+    "print(\"model params:\")\n",
+    "pprint(extract_model_params(model))\n",
+    "print(\"likelihood params:\")\n",
+    "pprint(extract_model_params(likelihood))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ae35e11",
+   "metadata": {},
+   "source": [
+    "# Run prediction"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "629b4658",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluation (predictive posterior) mode\n",
+    "model.eval()\n",
+    "likelihood.eval()\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    M = 10\n",
+    "    post_pred_f = model(X_pred)\n",
+    "    post_pred_y = likelihood(model(X_pred))\n",
+    "\n",
+    "    fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)\n",
+    "    fig_sigmas, ax_sigmas = plt.subplots()\n",
+    "    for ii, (ax, post_pred, name, title) in enumerate(\n",
+    "        zip(\n",
+    "            axs,\n",
+    "            [post_pred_f, post_pred_y],\n",
+    "            [\"f\", \"y\"],\n",
+    "            [\"epistemic uncertainty\", \"total uncertainty\"],\n",
+    "        )\n",
+    "    ):\n",
+    "        yf_mean = post_pred.mean\n",
+    "        yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n",
+    "\n",
+    "        yf_std = post_pred.stddev\n",
+    "        lower = yf_mean - 2 * yf_std\n",
+    "        upper = yf_mean + 2 * yf_std\n",
+    "        ax.plot(\n",
+    "            X_train.numpy(),\n",
+    "            y_train.numpy(),\n",
+    "            \"o\",\n",
+    "            label=\"data\",\n",
+    "            color=\"tab:blue\",\n",
+    "        )\n",
+    "        ax.plot(\n",
+    "            X_pred.numpy(),\n",
+    "            yf_mean.numpy(),\n",
+    "            label=\"mean\",\n",
+    "            color=\"tab:red\",\n",
+    "            lw=2,\n",
+    "        )\n",
+    "        ax.plot(\n",
+    "            X_pred.numpy(),\n",
+    "            y_gt_pred.numpy(),\n",
+    "            label=\"ground truth\",\n",
+    "            color=\"k\",\n",
+    "            lw=2,\n",
+    "            ls=\"--\",\n",
+    "        )\n",
+    "        ax.fill_between(\n",
+    "            X_pred.numpy(),\n",
+    "            lower.numpy(),\n",
+    "            upper.numpy(),\n",
+    "            label=\"confidence\",\n",
+    "            color=\"tab:orange\",\n",
+    "            alpha=0.3,\n",
+    "        )\n",
+    "        ax.set_title(f\"confidence = {title}\")\n",
+    "        if name == \"f\":\n",
+    "            sigma_label = r\"epistemic: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_*)}$\"\n",
+    "            zorder = 1\n",
+    "        else:\n",
+    "            sigma_label = (\n",
+    "                r\"total: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_* + \\sigma_n^2\\,I)}$\"\n",
+    "            )\n",
+    "            zorder = 0\n",
+    "        ax_sigmas.fill_between(\n",
+    "            X_pred.numpy(),\n",
+    "            lower.numpy(),\n",
+    "            upper.numpy(),\n",
+    "            label=sigma_label,\n",
+    "            color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n",
+    "            alpha=0.5,\n",
+    "            zorder=zorder,\n",
+    "        )\n",
+    "        y_min = y_train.min()\n",
+    "        y_max = y_train.max()\n",
+    "        y_span = y_max - y_min\n",
+    "        ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])\n",
+    "        plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. samples\")\n",
+    "        if ii == 1:\n",
+    "            ax.legend()\n",
+    "    ax_sigmas.set_title(\"total vs. epistemic uncertainty\")\n",
+    "    ax_sigmas.legend()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "60abb7ec",
+   "metadata": {},
+   "source": [
+    "# Let's check the learned noise"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8a32f5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Target noise to learn\n",
+    "print(\"data noise:\", noise_std)\n",
+    "\n",
+    "# The two below must be the same\n",
+    "print(\n",
+    "    \"learned noise:\",\n",
+    "    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n",
+    ")\n",
+    "print(\n",
+    "    \"learned noise:\",\n",
+    "    np.sqrt(\n",
+    "        extract_model_params(likelihood, try_item=True)[\"noise_covar.noise\"]\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0637729a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# When running as script\n",
+    "if not is_interactive():\n",
+    "    plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:light"
+  },
+  "kernelspec": {
+   "display_name": "bayes-ml-course",
+   "language": "python",
+   "name": "bayes-ml-course"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py
new file mode 100644
index 0000000000000000000000000000000000000000..644e61a37b818301dd4604bb25d74b64cb5be511
--- /dev/null
+++ b/BLcourse2.3/03_one_dim_SVI.py
@@ -0,0 +1,376 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:light
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.17.1
+#   kernelspec:
+#     display_name: bayes-ml-course
+#     language: python
+#     name: bayes-ml-course
+# ---
+
+# # About
+#
+# In this notebook, we replace the ExactGP inference and log marginal
+# likelihood optimization with sparse stochastic variational inference.
+# This serves as an example of the many methods `gpytorch` offers to make GPs
+# scale to large data sets.
+# $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
+# $\newcommand{\ma}[1]{\mathbf{#1}}$
+# $\newcommand{\pred}[1]{\rm{#1}}$
+# $\newcommand{\predve}[1]{\mathbf{#1}}$
+# $\newcommand{\test}[1]{#1_*}$
+# $\newcommand{\testtest}[1]{#1_{**}}$
+# $\DeclareMathOperator{\diag}{diag}$
+# $\DeclareMathOperator{\cov}{cov}$
+
+# # Imports, helpers, setup
+
+# ##%matplotlib notebook
+# %matplotlib widget
+# ##%matplotlib inline
+
+# +
+import math
+from collections import defaultdict
+from pprint import pprint
+
+import torch
+import gpytorch
+from matplotlib import pyplot as plt
+from matplotlib import is_interactive
+import numpy as np
+from torch.utils.data import TensorDataset, DataLoader
+
+from utils import extract_model_params, plot_samples
+
+
+torch.set_default_dtype(torch.float64)
+torch.manual_seed(123)
+# -
+
+# # Generate toy 1D data
+#
+# Now we generate 10x more points than in the ExactGP case, yet inference
+# won't be much slower (exact GPs scale roughly as $N^3$). Note that the data we
+# use here is still tiny (1000 points is easy even for exact GPs), so the
+# method's usefulness cannot be fully exploited in our example
+# -- also we don't even use a GPU yet :).
+
+
+# +
+def ground_truth(x, const):
+    return torch.sin(x) * torch.exp(-0.2 * x) + const
+
+
+def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):
+    noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)
+    y = ground_truth(x, const=const) + noise_dist.sample(
+        sample_shape=(len(x),)
+    )
+    msk = torch.tensor([True] * len(x))
+    if gaps is not None:
+        for g in gaps:
+            msk = msk & ~((x > g[0]) & (x < g[1]))
+    return x[msk], y[msk], y
+
+
+const = 5.0
+noise_std = 0.1
+x = torch.linspace(0, 4 * math.pi, 1000)
+X_train, y_train, y_gt_train = generate_data(
+    x, gaps=[[6, 10]], const=const, noise_std=noise_std
+)
+X_pred = torch.linspace(
+    X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False
+)
+y_gt_pred = ground_truth(X_pred, const=const)
+
+print(f"{X_train.shape=}")
+print(f"{y_train.shape=}")
+print(f"{X_pred.shape=}")
+
+fig, ax = plt.subplots()
+ax.scatter(X_train, y_train, marker="o", color="tab:blue", label="noisy data")
+ax.plot(X_pred, y_gt_pred, ls="--", color="k", label="ground truth")
+ax.legend()
+# -
+
+# # Define GP model
+#
+# The model follows [this
+# example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html)
+# based on [Hensman et al., "Scalable Variational Gaussian Process Classification",
+# 2015](https://proceedings.mlr.press/v38/hensman15.html). The model is
+# "sparse" since it works with a set of *inducing* points $(\ma Z, \ve u),
+# \ve u=f(\ma Z)$ which is much smaller than the train data $(\ma X, \ve y)$.
+#
+# We have the same hyper parameters as before
+#
+# * $\ell$ = `model.covar_module.base_kernel.lengthscale`
+# * $\sigma_n^2$ = `likelihood.noise_covar.noise`
+# * $s$ = `model.covar_module.outputscale`
+# * $m(\ve x) = c$ = `model.mean_module.constant`
+#
+# plus additional ones, introduced by the approximations used:
+#
+# * the learnable inducing points $\ma Z$ for the variational distribution
+#   $q_{\ve\psi}(\ve u)$
+# * learnable parameters of the variational distribution $q_{\ve\psi}(\ve u)=\mathcal N(\ve m_u, \ma S)$: the
+#   variational mean $\ve m_u$ and covariance $\ma S$, the latter in the form
+#   of a lower triangular matrix $\ma L$ such that $\ma S=\ma L\,\ma L^\top$
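+#
+# As a side note (a standalone sketch, not code used below): the reason for
+# parametrizing the covariance via a lower triangular $\ma L$ is that
+# $\ma S=\ma L\,\ma L^\top$ is symmetric and positive semi-definite for *any*
+# $\ma L$, so the optimizer can update $\ma L$ freely without ever leaving the
+# space of valid covariance matrices:
+#
+# ```
+# import torch
+#
+# L = torch.tril(torch.randn(4, 4))  # arbitrary lower triangular matrix
+# S = L @ L.T
+# print(torch.allclose(S, S.T))                      # symmetric
+# print((torch.linalg.eigvalsh(S) >= -1e-12).all())  # eigenvalues >= 0 (up to fp error)
+# ```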
+
+
+# +
+class ApproxGPModel(gpytorch.models.ApproximateGP):
+    def __init__(self, Z):
+        # Approximate inducing value posterior q(u), u = f(Z), Z = inducing
+        # points (subset of X_train)
+        variational_distribution = (
+            gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))
+        )
+        # Compute q(f(X)) from q(u)
+        variational_strategy = gpytorch.variational.VariationalStrategy(
+            self,
+            Z,
+            variational_distribution,
+            learn_inducing_locations=True,
+        )
+        super().__init__(variational_strategy)
+        self.mean_module = gpytorch.means.ConstantMean()
+        self.covar_module = gpytorch.kernels.ScaleKernel(
+            gpytorch.kernels.RBFKernel()
+        )
+
+    def forward(self, x):
+        mean_x = self.mean_module(x)
+        covar_x = self.covar_module(x)
+        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
+
+
+likelihood = gpytorch.likelihoods.GaussianLikelihood()
+
+n_train = len(X_train)
+# Start values for inducing points Z, use 10% random sub-sample of X_train.
+ind_points_fraction = 0.1
+ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)]
+model = ApproxGPModel(Z=X_train[ind_idxs])
+# -
+
+
+# Inspect the model
+print(model)
+
+# Inspect the likelihood. In contrast to ExactGP, the likelihood is not part of
+# the GP model instance.
+print(likelihood)
+
+# Default start hyper params
+print("model params:")
+pprint(extract_model_params(model))
+print("likelihood params:")
+pprint(extract_model_params(likelihood))
+
+# Set new start hyper params (scalars only)
+model.mean_module.constant = 3.0
+model.covar_module.base_kernel.lengthscale = 1.0
+model.covar_module.outputscale = 1.0
+likelihood.noise_covar.noise = 0.3
+
+# # Fit GP to data: optimize hyper params
+#
+# Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),
+# where we optimize not the log marginal likelihood (ExactGP case),
+# but an ELBO (evidence lower bound) objective
+#
+# $$
+# \max_\ve\zeta\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] -
+# D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right)
+# $$
+#
+# with respect to
+#
+# $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$
+#
+# with
+#
+# $$\ve\psi = [\ve m_u, \ma Z, \ma L]$$
+#
+# the parameters of the variational distribution $q_{\ve\psi}(\ve u)$.
+#
+# In addition, we perform a stochastic
+# optimization by using a deep learning type mini-batch loop, hence
+# "stochastic" variational inference (SVI). The latter speeds up the
+# optimization since we only look at a fraction of data per optimizer step to
+# calculate an approximate loss gradient (`loss.backward()`).
+
+# +
+# Train mode
+model.train()
+likelihood.train()
+
+optimizer = torch.optim.Adam(
+    [dict(params=model.parameters()), dict(params=likelihood.parameters())],
+    lr=0.1,
+)
+loss_func = gpytorch.mlls.VariationalELBO(
+    likelihood, model, num_data=X_train.shape[0]
+)
+
+train_dl = DataLoader(
+    TensorDataset(X_train, y_train), batch_size=128, shuffle=True
+)
+
+n_iter = 200
+history = defaultdict(list)
+for i_iter in range(n_iter):
+    for i_batch, (X_batch, y_batch) in enumerate(train_dl):
+        batch_history = defaultdict(list)
+        optimizer.zero_grad()
+        loss = -loss_func(model(X_batch), y_batch)
+        loss.backward()
+        optimizer.step()
+        param_dct = dict()
+        param_dct.update(extract_model_params(model, try_item=True))
+        param_dct.update(extract_model_params(likelihood, try_item=True))
+        for p_name, p_val in param_dct.items():
+            if isinstance(p_val, float):
+                batch_history[p_name].append(p_val)
+        batch_history["loss"].append(loss.item())
+    for p_name, p_lst in batch_history.items():
+        history[p_name].append(np.mean(p_lst))
+    if (i_iter + 1) % 10 == 0:
+        print(f"iter {i_iter + 1}/{n_iter}, {loss=:.3f}")
+# -
+
+# Plot scalar hyper params and loss (ELBO) convergence
+ncols = len(history)
+fig, axs = plt.subplots(
+    ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed"
+)
+with torch.no_grad():
+    for ax, (p_name, p_lst) in zip(axs, history.items()):
+        ax.plot(p_lst)
+        ax.set_title(p_name)
+        ax.set_xlabel("iterations")
+
+# Values of optimized hyper params
+print("model params:")
+pprint(extract_model_params(model))
+print("likelihood params:")
+pprint(extract_model_params(likelihood))
+
+
+# # Run prediction
+
+# +
+# Evaluation (predictive posterior) mode
+model.eval()
+likelihood.eval()
+
+with torch.no_grad():
+    M = 10
+    post_pred_f = model(X_pred)
+    post_pred_y = likelihood(model(X_pred))
+
+    fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)
+    fig_sigmas, ax_sigmas = plt.subplots()
+    for ii, (ax, post_pred, name, title) in enumerate(
+        zip(
+            axs,
+            [post_pred_f, post_pred_y],
+            ["f", "y"],
+            ["epistemic uncertainty", "total uncertainty"],
+        )
+    ):
+        yf_mean = post_pred.mean
+        yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))
+
+        yf_std = post_pred.stddev
+        lower = yf_mean - 2 * yf_std
+        upper = yf_mean + 2 * yf_std
+        ax.plot(
+            X_train.numpy(),
+            y_train.numpy(),
+            "o",
+            label="data",
+            color="tab:blue",
+        )
+        ax.plot(
+            X_pred.numpy(),
+            yf_mean.numpy(),
+            label="mean",
+            color="tab:red",
+            lw=2,
+        )
+        ax.plot(
+            X_pred.numpy(),
+            y_gt_pred.numpy(),
+            label="ground truth",
+            color="k",
+            lw=2,
+            ls="--",
+        )
+        ax.fill_between(
+            X_pred.numpy(),
+            lower.numpy(),
+            upper.numpy(),
+            label="confidence",
+            color="tab:orange",
+            alpha=0.3,
+        )
+        ax.set_title(f"confidence = {title}")
+        if name == "f":
+            sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$"
+            zorder = 1
+        else:
+            sigma_label = (
+                r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$"
+            )
+            zorder = 0
+        ax_sigmas.fill_between(
+            X_pred.numpy(),
+            lower.numpy(),
+            upper.numpy(),
+            label=sigma_label,
+            color="tab:orange" if name == "f" else "tab:blue",
+            alpha=0.5,
+            zorder=zorder,
+        )
+        y_min = y_train.min()
+        y_max = y_train.max()
+        y_span = y_max - y_min
+        ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])
+        plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples")
+        if ii == 1:
+            ax.legend()
+    ax_sigmas.set_title("total vs. epistemic uncertainty")
+    ax_sigmas.legend()
+# -
+
+# # Let's check the learned noise
+
+# +
+# Target noise to learn
+print("data noise:", noise_std)
+
+# The two below must be the same
+print(
+    "learned noise:",
+    (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),
+)
+print(
+    "learned noise:",
+    np.sqrt(
+        extract_model_params(likelihood, try_item=True)["noise_covar.noise"]
+    ),
+)
+# -
+
+# When running as script
+if not is_interactive():
+    plt.show()
diff --git a/BLcourse2.3/README.md b/BLcourse2.3/README.md
index 21766d2c861fc568cce33e935241b908c3ce8938..28873335ed97e9dc83c448032e79fa6b25bc531d 100644
--- a/BLcourse2.3/README.md
+++ b/BLcourse2.3/README.md
@@ -8,4 +8,19 @@ $ jupyter-notebook notebook.ipynb
 ```
 
 For convenience the ipynb file is included. Please keep it in sync with the py
-file using jupytext.
+file using `jupytext` (see `convert-to-ipynb.sh`).
+
+One-time setup of a venv and an IPython kernel:
+
+```sh
+# or
+#   $ python -m venv --system-site-packages bayes-ml-course-sys
+#   $ source ./bayes-ml-course-sys/bin/activate
+$ mkvirtualenv --system-site-packages bayes-ml-course-sys
+$ pip install -r requirements.txt
+
+# Install custom kernel, select that in Jupyter. --sys-prefix installs into the
+# current venv, while --user would install into ~/.local/share/jupyter/kernels/
+$ python -m ipykernel install --name bayes-ml-course --sys-prefix
+Installed kernelspec bayes-ml-course in /home/elcorto/.virtualenvs/bayes-ml-course-sys/share/jupyter/kernels/bayes-ml-course
+```
diff --git a/BLcourse2.3/convert-to-ipynb.sh b/BLcourse2.3/convert-to-ipynb.sh
index be23e43b6e01e4ad115436765c007a8a12c78684..47d79cabe4e792141ee334421a2264913ce6afba 100755
--- a/BLcourse2.3/convert-to-ipynb.sh
+++ b/BLcourse2.3/convert-to-ipynb.sh
@@ -1,5 +1,10 @@
 #!/bin/sh
 
-for fn in 0*dim.py; do
-    jupytext --to ipynb $fn
+# Remove plots etc, but keep random cell IDs. We need this b/c we use jupytext
+# --update below.
+nbstripout --keep-id *.ipynb
+
+for fn in $(ls -1 | grep -E '[0-9]+.+\.py'); do
+    # --update keeps cell IDs -> smaller diffs
+    jupytext --to ipynb --update $fn
 done
diff --git a/BLcourse2.3/requirements.txt b/BLcourse2.3/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2a0a563ac5d2dd8a06ab84bdc9d5cb6ad6445ea6
--- /dev/null
+++ b/BLcourse2.3/requirements.txt
@@ -0,0 +1,39 @@
+# ----------------------------------------------------------------------------
+# Required to at least run the py scripts
+# ----------------------------------------------------------------------------
+torch
+gpytorch
+pyro-ppl
+matplotlib
+scikit-learn
+
+# ----------------------------------------------------------------------------
+# Jupyter stuff, this must be present in the used Jupyter kernel on a hosted
+# service like https://jupyter.jsc.fz-juelich.de
+# ----------------------------------------------------------------------------
+
+# Needed since we use paired notebooks. This is used for local dev and not
+# really needed on a hosted service, but still useful to avoid confusing
+# warnings (about the jupytext extension failing to load).
+jupytext
+
+# for %matplotlib widget
+ipympl
+
+# ----------------------------------------------------------------------------
+# Jupyter Lab, when running locally
+# ----------------------------------------------------------------------------
+
+# Jupyter Lab
+jupyter
+
+# or Jupyter notebook is also fine
+##notebook
+
+# ----------------------------------------------------------------------------
+# dev
+# ----------------------------------------------------------------------------
+
+nbstripout
+
+# vim:syn=sh
diff --git a/BLcourse2.3/utils.py b/BLcourse2.3/utils.py
index fc512d45c86aeb380087bd297aed0b3b29d84d7b..260be47d40d412b26d53a8a7db5e86c3e0996903 100644
--- a/BLcourse2.3/utils.py
+++ b/BLcourse2.3/utils.py
@@ -1,7 +1,8 @@
 from matplotlib import pyplot as plt
+import torch
 
 
-def extract_model_params(model, raw=False) -> dict:
+def extract_model_params(model, raw=False, try_item=False) -> dict:
     """Helper to convert model.named_parameters() to dict.
 
     With raw=True, use
@@ -11,9 +12,25 @@ def extract_model_params(model, raw=False) -> dict:
 
     See https://docs.gpytorch.ai/en/stable/examples/00_Basic_Usage/Hyperparameters.html#Raw-vs-Actual-Parameters
     """
+    if try_item:
+
+        def get_val(p):
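+            # Return a Python float via .item() if p is a scalar tensor or a
+            # tensor that squeezes to a single element; otherwise return p
+            # unchanged.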
+            if isinstance(p, torch.Tensor):
+                if p.ndim == 0:
+                    return p.item()
+                else:
+                    p_sq = p.squeeze()
+                    if (p_sq.ndim == 1 and len(p_sq) == 1) or p_sq.ndim == 0:
+                        return p_sq.item()
+                    else:
+                        return p
+            else:
+                return p
+    else:
+        get_val = lambda p: p
     if raw:
         return dict(
-            (p_name, p_val.item())
+            (p_name, get_val(p_val))
             for p_name, p_val in model.named_parameters()
         )
     else:
@@ -24,7 +41,7 @@ def extract_model_params(model, raw=False) -> dict:
             # Yes, eval() hack. Sorry.
             p_name = p_name.replace(".raw_", ".")
             p_val = eval(f"model.{p_name}")
-            out[p_name] = p_val.item()
+            out[p_name] = get_val(p_val)
         return out
 
 
diff --git a/slides/BLcourse2.3.pdf b/slides/BLcourse2.3.pdf
index 9e1b7f8cb80b4521c9b003cfec3fd8288ac81d5f..a6ba5a21a17b725248be9e9481bd3dbfbe474474 100644
Binary files a/slides/BLcourse2.3.pdf and b/slides/BLcourse2.3.pdf differ