diff --git a/BLcourse2.3/01_one_dim.ipynb b/BLcourse2.3/01_one_dim.ipynb index 896932037c6a6e7d5e4df952f404e7eed261c254..97a6a5892e3b9065fa3a04b5e2f0de588ddfdedf 100644 --- a/BLcourse2.3/01_one_dim.ipynb +++ b/BLcourse2.3/01_one_dim.ipynb @@ -10,9 +10,10 @@ "$\\newcommand{\\ma}[1]{\\mathbf{#1}}$\n", "$\\newcommand{\\pred}[1]{\\rm{#1}}$\n", "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n", - "$\\newcommand{\\cov}{\\mathrm{cov}}$\n", "$\\newcommand{\\test}[1]{#1_*}$\n", "$\\newcommand{\\testtest}[1]{#1_{**}}$\n", + "$\\DeclareMathOperator{\\diag}{diag}$\n", + "$\\DeclareMathOperator{\\cov}{cov}$\n", "\n", "Vector $\\ve a\\in\\mathbb R^n$ or $\\mathbb R^{n\\times 1}$, so \"column\" vector.\n", "Matrix $\\ma A\\in\\mathbb R^{n\\times m}$. Design matrix with input vectors $\\ve\n", @@ -59,6 +60,7 @@ "import gpytorch\n", "from matplotlib import pyplot as plt\n", "from matplotlib import is_interactive\n", + "import numpy as np\n", "\n", "from utils import extract_model_params, plot_samples\n", "\n", @@ -103,26 +105,41 @@ "metadata": {}, "outputs": [], "source": [ - "def generate_data(x, gaps=[[1, 3]], const=5):\n", - " y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const\n", + "def ground_truth(x, const):\n", + " return torch.sin(x) * torch.exp(-0.2 * x) + const\n", + "\n", + "\n", + "def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):\n", + " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", + " y = ground_truth(x, const=const) + noise_dist.sample(\n", + " sample_shape=(len(x),)\n", + " )\n", " msk = torch.tensor([True] * len(x))\n", " if gaps is not None:\n", " for g in gaps:\n", " msk = msk & ~((x > g[0]) & (x < g[1]))\n", - " return x[msk], y[msk]\n", + " return x[msk], y[msk], y\n", "\n", "\n", + "const = 5.0\n", + "noise_std = 0.1\n", "x = torch.linspace(0, 4 * math.pi, 100)\n", - "X_train, y_train = generate_data(x, gaps=[[6, 10]])\n", + "X_train, y_train, y_gt_train = generate_data(\n", + " x, gaps=[[6, 10]], const=const, noise_std=noise_std\n", + ")\n", "X_pred = torch.linspace(\n", " X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False\n", ")\n", + "y_gt_pred = ground_truth(X_pred, const=const)\n", "\n", "print(f\"{X_train.shape=}\")\n", "print(f\"{y_train.shape=}\")\n", "print(f\"{X_pred.shape=}\")\n", "\n", - "plt.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\")" + "fig, ax = plt.subplots()\n", + "ax.scatter(X_train, y_train, marker=\"o\", color=\"tab:blue\", label=\"noisy data\")\n", + "ax.plot(X_pred, y_gt_pred, ls=\"--\", color=\"k\", label=\"ground truth\")\n", + "ax.legend()" ] }, { @@ -145,7 +162,7 @@ "`ScaleKernel`.\n", "\n", "In addition, we define a constant mean via `ConstantMean`. Finally we have\n", - "the likelihood noise $\\sigma_n^2$. So in total we have 4 hyper params.\n", + "the likelihood variance $\\sigma_n^2$. 
So in total we have 4 hyper params.\n", "\n", "* $\\ell$ = `model.covar_module.base_kernel.lengthscale`\n", "* $\\sigma_n^2$ = `model.likelihood.noise_covar.noise`\n", @@ -225,7 +242,7 @@ "model.mean_module.constant = 3.0\n", "model.covar_module.base_kernel.lengthscale = 1.0\n", "model.covar_module.outputscale = 1.0\n", - "model.likelihood.noise_covar.noise = 0.1\n", + "model.likelihood.noise_covar.noise = 1e-3\n", "\n", "pprint(extract_model_params(model, raw=False))" ] @@ -237,11 +254,16 @@ "source": [ "# Sample from the GP prior\n", "\n", - "We sample a number of functions $f_j, j=1,\\ldots,M$ from the GP prior and\n", + "We sample a number of functions $f_m, m=1,\\ldots,M$ from the GP prior and\n", "evaluate them at all $\\ma X$ = `X_pred` points, of which we have $N=200$. So\n", "we effectively generate samples from $p(\\predve f|\\ma X) = \\mathcal N(\\ve\n", - "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ and the\n", - "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$." + "c, \\ma K)$. Each sampled vector $\\predve f\\in\\mathbb R^{N}$ represents a\n", + "sampled *function* $f$ evaluated the $N=200$ points in $\\ma X$. The\n", + "covariance (kernel) matrix is $\\ma K\\in\\mathbb R^{N\\times N}$. Its diagonal\n", + "$\\diag\\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis.\n", + "This is what we plot as \"confidence band\" `f_mean` $\\pm$ `2 * f_std`.\n", + "The off-diagonal elements represent the correlation between different points\n", + "$K_{ij} = \\cov[f(\\ve x_i), f(\\ve x_j)]$." ] }, { @@ -286,11 +308,12 @@ "id": "0598b567", "metadata": {}, "source": [ - "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n", - "\\ve c$ does *not* mean that each sampled vector $\\predve f$'s mean is\n", - "equal to $c$. Instead, we have that at each $\\ve x_i$, the mean of\n", - "*all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M f_j(\\ve x_i)\n", - "\\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" + "Let's investigate the samples more closely. First we note that the samples\n", + "fluctuate around the mean `model.mean_module.constant` we defined above. A\n", + "constant mean $\\ve m(\\ma X) = \\ve c$ does *not* mean that each sampled vector\n", + "$\\predve f$'s mean is equal to $c$. Instead, we have that at each $\\ve x_i$,\n", + "the mean of *all* sampled functions is the same, so $\\frac{1}{M}\\sum_{j=1}^M\n", + "f_m(\\ve x_i) \\approx c$ and for $M\\rightarrow\\infty$ it will be exactly $c$.\n" ] }, { @@ -330,21 +353,20 @@ "# GP posterior predictive distribution with fixed hyper params\n", "\n", "Now we calculate the posterior predictive distribution $p(\\test{\\predve\n", - "f}|\\test{\\ma X}, \\ma X, \\ve y)$, i.e. we condition on the train data (Bayesian\n", + "f}|\\test{\\ma X}, \\ma X, \\ve y) = \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$,\n", + "i.e. we condition on the train data (Bayesian\n", "inference).\n", "\n", "We use the fixed hyper param values defined above. In particular, since\n", "$\\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a\n", - "regression setting." + "regression setting -- the GP's mean doesn't interpolate all points." 
] }, { "cell_type": "code", "execution_count": null, "id": "d01951f1", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "# Evaluation (predictive posterior) mode\n", @@ -375,6 +397,14 @@ " color=\"tab:red\",\n", " lw=2,\n", " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " y_gt_pred.numpy(),\n", + " label=\"ground truth\",\n", + " color=\"k\",\n", + " lw=2,\n", + " ls=\"--\",\n", + " )\n", " ax.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", @@ -395,20 +425,30 @@ "cell_type": "markdown", "id": "b960a745", "metadata": {}, + "source": [ + "We observe that all sampled functions (green) and the mean (red) tend towards\n", + "the low fixed mean function $m(\\ve x)=c$ at 3.0 in the absence of data, while\n", + "the actual data mean is `const` from above (data generation). Also the other\n", + "hyper params ($\\ell$, $\\sigma_n^2$, $s$) are just guesses. Now we will\n", + "calculate their actual value by minimizing the negative log marginal\n", + "likelihood." + ] + }, + { + "cell_type": "markdown", + "id": "2de7f663", + "metadata": {}, "source": [ "# Fit GP to data: optimize hyper params\n", "\n", "In each step of the optimizer, we condition on the training data (e.g. do\n", "Bayesian inference) to calculate the posterior predictive distribution for\n", - "the current values of the hyper params. We iterate until the log marginal\n", + "the current values of the hyper params. We iterate until the negative log marginal\n", "likelihood is converged.\n", "\n", "We use a simplistic PyTorch-style hand written train loop without convergence\n", "control, so make sure to use enough `n_iter` and eyeball-check that the loss\n", - "is converged :-)\n", - "\n", - "Observe how all hyper params converge. In particular, note that the constant\n", - "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." + "is converged :-)" ] }, { @@ -433,7 +473,7 @@ " loss.backward()\n", " optimizer.step()\n", " if (ii + 1) % 10 == 0:\n", - " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", + " print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n", " for p_name, p_val in extract_model_params(model).items():\n", " history[p_name].append(p_val)\n", " history[\"loss\"].append(loss.item())" @@ -446,7 +486,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Plot hyper params and loss (neg. log marginal likelihood) convergence\n", + "# Plot hyper params and loss (negative log marginal likelihood) convergence\n", "ncols = len(history)\n", "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))\n", "for ax, (p_name, p_lst) in zip(axs, history.items()):\n", @@ -470,20 +510,26 @@ "cell_type": "markdown", "id": "98aefb90", "metadata": {}, + "source": [ + "We see that all hyper params converge. In particular, note that the constant\n", + "mean $m(\\ve x)=c$ converges to the `const` value in `generate_data()`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b2f6afb0", + "metadata": {}, "source": [ "# Run prediction\n", "\n", - "We show \"noiseless\" (left: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma)}$) vs.\n", - "\"noisy\" (right: $\\sigma = \\sqrt{\\mathrm{diag}(\\ma\\Sigma + \\sigma_n^2\\,\\ma\n", - "I_N)}$) predictions with\n", + "We run prediction with two variants of the posterior predictive distribution:\n", + "using either only the epistemic uncertainty or using the total uncertainty.\n", "\n", - "$$\\ma\\Sigma = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$$\n", - "\n", - "We find that $\\ma\\Sigma$ reflects behavior we would like to see from\n", - "epistemic uncertainty -- it is high when we have no data\n", - "(out-of-distribution). But this alone isn't the whole story. We need to add\n", - "the estimated noise level $\\sigma_n^2$ in order for the confidence band to\n", - "cover the data." + "* epistemic: $p(\\test{\\predve f}|\\test{\\ma X}, \\ma X, \\ve y) =\n", + " \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma})$ = `post_pred_f` with\n", + " $\\test{\\ma\\Sigma} = \\testtest{\\ma K} - \\test{\\ma K}\\,(\\ma K+\\sigma_n^2\\,\\ma I)^{-1}\\,\\test{\\ma K}^\\top$\n", + "* total: $p(\\test{\\predve y}|\\test{\\ma X}, \\ma X, \\ve y) =\n", + " \\mathcal N(\\test{\\ve\\mu}, \\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N))$ = `post_pred_y`" ] }, { @@ -504,8 +550,13 @@ "\n", " fig, axs = plt.subplots(ncols=2, figsize=(12, 5))\n", " fig_sigmas, ax_sigmas = plt.subplots()\n", - " for ii, (ax, post_pred, name) in enumerate(\n", - " zip(axs, [post_pred_f, post_pred_y], [\"f\", \"y\"])\n", + " for ii, (ax, post_pred, name, title) in enumerate(\n", + " zip(\n", + " axs,\n", + " [post_pred_f, post_pred_y],\n", + " [\"f\", \"y\"],\n", + " [\"epistemic uncertainty\", \"total uncertainty\"],\n", + " )\n", " ):\n", " yf_mean = post_pred.mean\n", " yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))\n", @@ -527,6 +578,14 @@ " color=\"tab:red\",\n", " lw=2,\n", " )\n", + " ax.plot(\n", + " X_pred.numpy(),\n", + " y_gt_pred.numpy(),\n", + " label=\"ground truth\",\n", + " color=\"k\",\n", + " lw=2,\n", + " ls=\"--\",\n", + " )\n", " ax.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", @@ -535,19 +594,20 @@ " color=\"tab:orange\",\n", " alpha=0.3,\n", " )\n", + " ax.set_title(f\"confidence = {title}\")\n", " if name == \"f\":\n", - " sigma_label = r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma)}$\"\n", + " sigma_label = r\"epistemic: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_*)}$\"\n", " zorder = 1\n", " else:\n", " sigma_label = (\n", - " r\"$\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma + \\sigma_n^2\\,I)}$\"\n", + " r\"total: $\\pm 2\\sqrt{\\mathrm{diag}(\\Sigma_* + \\sigma_n^2\\,I)}$\"\n", " )\n", " zorder = 0\n", " ax_sigmas.fill_between(\n", " X_pred.numpy(),\n", " lower.numpy(),\n", " upper.numpy(),\n", - " label=\"confidence \" + sigma_label,\n", + " label=sigma_label,\n", " color=\"tab:orange\" if name == \"f\" else \"tab:blue\",\n", " alpha=0.5,\n", " zorder=zorder,\n", @@ -559,9 +619,60 @@ " plot_samples(ax, X_pred, yf_samples, label=\"posterior pred. samples\")\n", " if ii == 1:\n", " ax.legend()\n", + " ax_sigmas.set_title(\"total vs. 
epistemic uncertainty\")\n", " ax_sigmas.legend()" ] }, + { + "cell_type": "markdown", + "id": "4c52a919", + "metadata": {}, + "source": [ + "We find that $\\test{\\ma\\Sigma}$ reflects behavior we would like to see from\n", + "epistemic uncertainty -- it is high when we have no data\n", + "(out-of-distribution). But this alone isn't the whole story. We need to add\n", + "the estimated likelihood variance $\\sigma_n^2$ in order for the confidence\n", + "band to cover the data -- this turns the estimated total uncertainty into a\n", + "*calibrated* uncertainty." + ] + }, + { + "cell_type": "markdown", + "id": "8823deb0", + "metadata": {}, + "source": [ + "# Let's check the learned noise\n", + "\n", + "We compare the target data noise (`noise_std`) to the learned GP noise, in\n", + "the form of the likelihood *standard deviation* $\\sigma_n$. The latter is\n", + "equal to the $\\sqrt{\\cdot}$ of `likelihood.noise_covar.noise` and can also be\n", + "calculated via $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N) -\n", + "\\diag(\\test{\\ma\\Sigma}})$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7cd1c02", + "metadata": {}, + "outputs": [], + "source": [ + "# Target noise to learn\n", + "print(\"data noise:\", noise_std)\n", + "\n", + "# The two below must be the same\n", + "print(\n", + " \"learned noise:\",\n", + " (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n", + ")\n", + "print(\n", + " \"learned noise:\",\n", + " np.sqrt(\n", + " extract_model_params(model, raw=False)[\"likelihood.noise_covar.noise\"]\n", + " ),\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -580,9 +691,9 @@ "formats": "ipynb,py:light" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "bayes-ml-course", "language": "python", - "name": "python3" + "name": "bayes-ml-course" } }, "nbformat": 4, diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py index 39150d5179d539280a5dddaa5cb1731a3860c376..52a76287f5f7d8a6d3ed1dc273c151252ea35fdc 100644 --- a/BLcourse2.3/01_one_dim.py +++ b/BLcourse2.3/01_one_dim.py @@ -18,9 +18,10 @@ # $\newcommand{\ma}[1]{\mathbf{#1}}$ # $\newcommand{\pred}[1]{\rm{#1}}$ # $\newcommand{\predve}[1]{\mathbf{#1}}$ -# $\newcommand{\cov}{\mathrm{cov}}$ # $\newcommand{\test}[1]{#1_*}$ # $\newcommand{\testtest}[1]{#1_{**}}$ +# $\DeclareMathOperator{\diag}{diag}$ +# $\DeclareMathOperator{\cov}{cov}$ # # Vector $\ve a\in\mathbb R^n$ or $\mathbb R^{n\times 1}$, so "column" vector. # Matrix $\ma A\in\mathbb R^{n\times m}$. Design matrix with input vectors $\ve @@ -185,11 +186,16 @@ pprint(extract_model_params(model, raw=False)) # # Sample from the GP prior # -# We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and +# We sample a number of functions $f_m, m=1,\ldots,M$ from the GP prior and # evaluate them at all $\ma X$ = `X_pred` points, of which we have $N=200$. So # we effectively generate samples from $p(\predve f|\ma X) = \mathcal N(\ve -# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ and the -# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. +# c, \ma K)$. Each sampled vector $\predve f\in\mathbb R^{N}$ represents a +# sampled *function* $f$ evaluated the $N=200$ points in $\ma X$. The +# covariance (kernel) matrix is $\ma K\in\mathbb R^{N\times N}$. Its diagonal +# $\diag\ma K$ = `f_std**2` represents the variance at each point on the $x$-axis. +# This is what we plot as "confidence band" `f_mean` $\pm$ `2 * f_std`. 
+# The off-diagonal elements represent the correlation between different points +# $K_{ij} = \cov[f(\ve x_i), f(\ve x_j)]$. # + model.eval() @@ -226,7 +232,7 @@ with torch.no_grad(): # constant mean $\ve m(\ma X) = \ve c$ does *not* mean that each sampled vector # $\predve f$'s mean is equal to $c$. Instead, we have that at each $\ve x_i$, # the mean of *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M -# f_j(\ve x_i) \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. +# f_m(\ve x_i) \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$. # # Look at the first 20 x points from M=10 samples @@ -245,7 +251,8 @@ print(f"{f_samples.mean(axis=0).std()=}") # # GP posterior predictive distribution with fixed hyper params # # Now we calculate the posterior predictive distribution $p(\test{\predve -# f}|\test{\ma X}, \ma X, \ve y)$, i.e. we condition on the train data (Bayesian +# f}|\test{\ma X}, \ma X, \ve y) = \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$, +# i.e. we condition on the train data (Bayesian # inference). # # We use the fixed hyper param values defined above. In particular, since @@ -361,11 +368,14 @@ pprint(extract_model_params(model, raw=False)) # # Run prediction # -# We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs. -# "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma -# I_N)}$) predictions with +# We run prediction with two variants of the posterior predictive distribution: +# using either only the epistemic uncertainty or using the total uncertainty. # -# $$\ma\Sigma = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$$ +# * epistemic: $p(\test{\predve f}|\test{\ma X}, \ma X, \ve y) = +# \mathcal N(\test{\ve\mu}, \test{\ma\Sigma})$ = `post_pred_f` with +# $\test{\ma\Sigma} = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$ +# * total: $p(\test{\predve y}|\test{\ma X}, \ma X, \ve y) = +# \mathcal N(\test{\ve\mu}, \test{\ma\Sigma} + \sigma_n^2\,\ma I_N))$ = `post_pred_y` # + # Evaluation (predictive posterior) mode @@ -379,8 +389,13 @@ with torch.no_grad(): fig, axs = plt.subplots(ncols=2, figsize=(12, 5)) fig_sigmas, ax_sigmas = plt.subplots() - for ii, (ax, post_pred, name) in enumerate( - zip(axs, [post_pred_f, post_pred_y], ["f", "y"]) + for ii, (ax, post_pred, name, title) in enumerate( + zip( + axs, + [post_pred_f, post_pred_y], + ["f", "y"], + ["epistemic uncertainty", "total uncertainty"], + ) ): yf_mean = post_pred.mean yf_samples = post_pred.sample(sample_shape=torch.Size((M,))) @@ -418,19 +433,20 @@ with torch.no_grad(): color="tab:orange", alpha=0.3, ) + ax.set_title(f"confidence = {title}") if name == "f": - sigma_label = r"$\pm 2\sqrt{\mathrm{diag}(\Sigma)}$" + sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$" zorder = 1 else: sigma_label = ( - r"$\pm 2\sqrt{\mathrm{diag}(\Sigma + \sigma_n^2\,I)}$" + r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$" ) zorder = 0 ax_sigmas.fill_between( X_pred.numpy(), lower.numpy(), upper.numpy(), - label="confidence " + sigma_label, + label=sigma_label, color="tab:orange" if name == "f" else "tab:blue", alpha=0.5, zorder=zorder, @@ -442,10 +458,11 @@ with torch.no_grad(): plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples") if ii == 1: ax.legend() + ax_sigmas.set_title("total vs. 
epistemic uncertainty") ax_sigmas.legend() # - -# We find that $\ma\Sigma$ reflects behavior we would like to see from +# We find that $\test{\ma\Sigma}$ reflects behavior we would like to see from # epistemic uncertainty -- it is high when we have no data # (out-of-distribution). But this alone isn't the whole story. We need to add # the estimated likelihood variance $\sigma_n^2$ in order for the confidence @@ -457,8 +474,8 @@ with torch.no_grad(): # We compare the target data noise (`noise_std`) to the learned GP noise, in # the form of the likelihood *standard deviation* $\sigma_n$. The latter is # equal to the $\sqrt{\cdot}$ of `likelihood.noise_covar.noise` and can also be -# calculated via $\sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma I_N) - -# \mathrm{diag}(\ma\Sigma)}$. +# calculated via $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N) - +# \diag(\test{\ma\Sigma}})$. # + # Target noise to learn diff --git a/BLcourse2.3/02_two_dim.ipynb b/BLcourse2.3/02_two_dim.ipynb index 403c6d74f7517e297d3732edea044cf2185081f9..fdebd011672d5e62a2939579ae935f2658cd5463 100644 --- a/BLcourse2.3/02_two_dim.ipynb +++ b/BLcourse2.3/02_two_dim.ipynb @@ -1,5 +1,13 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "8c19b309", + "metadata": {}, + "source": [ + "In this notebook, we use a GP to fit a 2D data set." + ] + }, { "cell_type": "code", "execution_count": null, @@ -146,6 +154,14 @@ "# Exercise 1" ] }, + { + "cell_type": "markdown", + "id": "c860aa4f", + "metadata": {}, + "source": [ + "Keep the settings below and explore the notebook till the end first." + ] + }, { "cell_type": "code", "execution_count": null, @@ -165,15 +181,23 @@ "# Exercise 2" ] }, + { + "cell_type": "markdown", + "id": "9f0c3b31", + "metadata": {}, + "source": [ + "First complete the notebook as is, then come back here." + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6bec0f58", + "id": "5132eaeb", "metadata": {}, "outputs": [], "source": [ - "##use_noise = True\n", - "##use_gap = False" + "##use_noise = False\n", + "##use_gap = True" ] }, { @@ -184,21 +208,29 @@ "# Exercise 3" ] }, + { + "cell_type": "markdown", + "id": "bfd5d87b", + "metadata": {}, + "source": [ + "First complete the notebook with Exercise 2, then come back here." 
+ ] + }, { "cell_type": "code", "execution_count": null, - "id": "5132eaeb", + "id": "f27dce89", "metadata": {}, "outputs": [], "source": [ - "##use_noise = False\n", + "##use_noise = True\n", "##use_gap = True" ] }, { "cell_type": "code", "execution_count": null, - "id": "6efe09b3", + "id": "cd8ac983", "metadata": {}, "outputs": [], "source": [ @@ -206,7 +238,9 @@ " # noisy train data\n", " noise_std = 0.2\n", " noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)\n", - " y_train = data_train.z + noise_dist.sample_n(len(data_train.z))\n", + " y_train = data_train.z + noise_dist.sample(\n", + " sample_shape=(len(data_train.z),)\n", + " )\n", "else:\n", " # noise-free train data\n", " noise_std = 0\n", @@ -216,11 +250,12 @@ { "cell_type": "code", "execution_count": null, - "id": "af637911", + "id": "59f6f6d6", "metadata": {}, "outputs": [], "source": [ - "# Cut out part of the train data to create out-of-distribution predictions\n", + "# Cut out part of the train data to create out-of-distribution predictions.\n", + "# Same as the \"gaps\" we created in the 1D case.\n", "\n", "if use_gap:\n", " mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)\n", @@ -356,10 +391,10 @@ "model.train()\n", "likelihood.train()\n", "\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=0.2)\n", + "optimizer = torch.optim.Adam(model.parameters(), lr=0.15)\n", "loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)\n", "\n", - "n_iter = 300\n", + "n_iter = 400\n", "history = defaultdict(list)\n", "for ii in range(n_iter):\n", " optimizer.zero_grad()\n", @@ -367,7 +402,7 @@ " loss.backward()\n", " optimizer.step()\n", " if (ii + 1) % 10 == 0:\n", - " print(f\"iter {ii+1}/{n_iter}, {loss=:.3f}\")\n", + " print(f\"iter {ii + 1}/{n_iter}, {loss=:.3f}\")\n", " for p_name, p_val in extract_model_params(model).items():\n", " history[p_name].append(p_val)\n", " history[\"loss\"].append(loss.item())" @@ -411,9 +446,7 @@ "cell_type": "code", "execution_count": null, "id": "59e18623", - "metadata": { - "lines_to_next_cell": 2 - }, + "metadata": {}, "outputs": [], "source": [ "model.eval()\n", @@ -445,6 +478,16 @@ "assert (post_pred_f.mean == post_pred_y.mean).all()" ] }, + { + "cell_type": "markdown", + "id": "bff55240", + "metadata": {}, + "source": [ + "When `use_noise=False`, then the GP's prediction is an almost pefect\n", + "reconstruction of the ground truth function (in-distribution, so where we\n", + "have data)." 
+ ] + }, { "cell_type": "markdown", "id": "591e453d", @@ -460,7 +503,7 @@ "metadata": {}, "outputs": [], "source": [ - "ncols = 3\n", + "ncols = 4\n", "fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))\n", "\n", "vmax = post_pred_y.stddev.max()\n", @@ -477,27 +520,42 @@ ")\n", "axs[0].set_title(\"|y_pred - y_true|\")\n", "\n", + "f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny))\n", + "y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))\n", + "\n", "cs.append(\n", " axs[1].contourf(\n", " data_pred.XG,\n", " data_pred.YG,\n", - " post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " f_std,\n", " vmin=0,\n", " vmax=vmax,\n", " )\n", ")\n", - "axs[1].set_title(\"f_std (epistemic)\")\n", + "axs[1].set_title(\"epistemic: f_std\")\n", "\n", "cs.append(\n", " axs[2].contourf(\n", " data_pred.XG,\n", " data_pred.YG,\n", - " post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),\n", + " y_std,\n", " vmin=0,\n", " vmax=vmax,\n", " )\n", ")\n", - "axs[2].set_title(\"y_std (epistemic + aleatoric)\")\n", + "axs[2].set_title(\"total: y_std\")\n", + "\n", + "cs.append(\n", + " axs[3].contourf(\n", + " data_pred.XG,\n", + " data_pred.YG,\n", + " y_std - f_std,\n", + " vmin=0,\n", + " cmap=\"plasma\",\n", + " ##vmax=vmax,\n", + " )\n", + ")\n", + "axs[3].set_title(\"aleatoric: y_std - f_std\")\n", "\n", "for ax, c in zip(axs, cs):\n", " ax.set_xlabel(\"X_0\")\n", @@ -511,7 +569,7 @@ "id": "04257cea", "metadata": {}, "source": [ - "# Check learned noise" + "## Let's check the learned noise" ] }, { @@ -521,13 +579,20 @@ "metadata": {}, "outputs": [], "source": [ - "print((post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt())\n", + "# Target noise to learn\n", + "print(\"data noise:\", noise_std)\n", + "\n", + "# The two below must be the same\n", "print(\n", + " \"learned noise:\",\n", + " (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),\n", + ")\n", + "print(\n", + " \"learned noise:\",\n", " np.sqrt(\n", " extract_model_params(model, raw=False)[\"likelihood.noise_covar.noise\"]\n", - " )\n", - ")\n", - "print(noise_std)" + " ),\n", + ")" ] }, { @@ -535,7 +600,71 @@ "id": "1da209ff", "metadata": {}, "source": [ - "# Plot confidence bands" + "# Observations" + ] + }, + { + "cell_type": "markdown", + "id": "c52e65c1", + "metadata": {}, + "source": [ + "We have the following terms:\n", + "\n", + "* epistemic: `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$\n", + "* total: `y_std` = $\\sqrt{\\diag(\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N)}$\n", + "* aleatoric: we have two ways of representing it\n", + " * from the likelihood: $\\sigma_n$\n", + " * for plotting: we use `y_std` - `f_std`, this is $\\neq \\sigma_n$ because of the $\\sqrt{\\cdot}$\n", + " above\n", + "\n", + "We can make the following observations:\n", + "\n", + "* Exercise 1: `use_noise=False`, `use_gap=False`\n", + " * The epistemic uncertainty `f_std` is a good indicator\n", + " of the (small) differences between model prediction and ground truth\n", + " * The learned variance $\\sigma_n^2$, and hence the aleatoric uncertainty is\n", + " near zero, which makes sense for noise-free data\n", + "* Exercise 2: `use_noise=False`, `use_gap=True`\n", + " * When faced with out-of-distribution (OOD) data, the epistemic `f_std`\n", + " clearly shows where the model will make wrong (less trustworthy)\n", + " predictions\n", + "* Exercise 3: `use_noise=True`, `use_gap=True`\n", + " * in-distribution (where we have data)\n", + " * The distinction between\n", + " 
epistemic and aleatoric in the way we define it is less meaningful,\n", + " hence, `f_std` doesn't correlate well with `y_pred - y_true`. The\n", + " reason is that the noise $\\sigma_n$ shows up in two parts: (a) in the\n", + " equation of $\\test{\\ma\\Sigma}$ itself, so the \"epistemic\" uncertainty\n", + " `f_std` = $\\sqrt{\\diag\\test{\\ma\\Sigma}}$ is bigger just because we have\n", + " noise (regression) and (b) we add it in $\\sqrt{\\diag(\\test{\\ma\\Sigma} +\n", + " \\sigma_n^2\\,\\ma I_N)}$ to get the total `y_std`\n", + " * We learn the value of `noise_std` ($\\sigma_n$) quite well and add **its\n", + " square** as a constant ($\\test{\\ma\\Sigma} + \\sigma_n^2\\,\\ma I_N$). The\n", + " `y_std` plot looks like the `f_std` one, but shifted by a constant. But\n", + " this is not the case because we compare standard deviations and not\n", + " variances, hence `y_std` - `f_std` is not constant, and in particular\n", + " $\\neq \\sigma_n$, but both are in the same numerical range (0.15 vs.\n", + " 0.2).\n", + " * out-of-distribution: `f_std` (epistemic) dominates" + ] + }, + { + "cell_type": "markdown", + "id": "08e949a2", + "metadata": {}, + "source": [ + "Exercises\n", + "\n", + "Go back up, switch on the settings for Exercise 2 and re-run the notebook.\n", + "Same with Exercise 3." + ] + }, + { + "cell_type": "markdown", + "id": "ee89f26a", + "metadata": {}, + "source": [ + "# Bonus: plot confidence bands" ] }, { diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py index 388947a2020d9484e6e87d03e6c33b4f4a62b340..01bda27d6596781714439a2be7ed8073707505c4 100644 --- a/BLcourse2.3/02_two_dim.py +++ b/BLcourse2.3/02_two_dim.py @@ -386,8 +386,8 @@ print( # We have the following terms: # -# * epistemic: `f_std` = $\sqrt{\mathrm{diag}\,\ma\Sigma}$ -# * total: `y_std` = $\sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma I_N)}$ +# * epistemic: `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ +# * total: `y_std` = $\sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N)}$ # * aleatoric: we have two ways of representing it # * from the likelihood: $\sigma_n$ # * for plotting: we use `y_std` - `f_std`, this is $\neq \sigma_n$ because of the $\sqrt{\cdot}$ @@ -398,26 +398,29 @@ print( # * Exercise 1: `use_noise=False`, `use_gap=False` # * The epistemic uncertainty `f_std` is a good indicator # of the (small) differences between model prediction and ground truth -# * The learned variance $\sigma_n^2$, and hence the aleatoric uncertainty is near zero, which makes sense for noise-free data +# * The learned variance $\sigma_n^2$, and hence the aleatoric uncertainty is +# near zero, which makes sense for noise-free data # * Exercise 2: `use_noise=False`, `use_gap=True` -# * When faced with out-of-distribution (OOD) data, the epistemic `f_std` clearly shows where the -# model will make wrong (less trustworthy) predictions +# * When faced with out-of-distribution (OOD) data, the epistemic `f_std` +# clearly shows where the model will make wrong (less trustworthy) +# predictions # * Exercise 3: `use_noise=True`, `use_gap=True` # * in-distribution (where we have data) # * The distinction between -# epistemic and aleatoric in the way we define it is less meaningful, hence, `f_std` doesn't -# correlate well with `y_pred - y_true`. 
The reason is that the noise -# $\sigma_n$ shows up in two parts: (a) in the equation of $\ma\Sigma$ -# itself, so the "epistemic" uncertainty `f_std` = $\sqrt{\mathrm{diag}\,\ma\Sigma}$ -# is bigger just because we have noise (regression) and (b) we add -# it in $\sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma I_N)}$ -# to get the total `y_std` -# * We learn the value of `noise_std` ($\sigma_n$) quite well and add **its square** as a constant ($\ma\Sigma + \sigma_n^2\,\ma I_N$). -# The `y_std` plot looks like the `f_std` one, but shifted by a constant. But this is not the case because -# we compare standard deviations and -# not variances, hence `y_std` - `f_std` is not constant, and in -# particular $\neq \sigma_n$, but both are in the same numerical range -# (0.15 vs. 0.2). +# epistemic and aleatoric in the way we define it is less meaningful, +# hence, `f_std` doesn't correlate well with `y_pred - y_true`. The +# reason is that the noise $\sigma_n$ shows up in two parts: (a) in the +# equation of $\test{\ma\Sigma}$ itself, so the "epistemic" uncertainty +# `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ is bigger just because we have +# noise (regression) and (b) we add it in $\sqrt{\diag(\test{\ma\Sigma} + +# \sigma_n^2\,\ma I_N)}$ to get the total `y_std` +# * We learn the value of `noise_std` ($\sigma_n$) quite well and add **its +# square** as a constant ($\test{\ma\Sigma} + \sigma_n^2\,\ma I_N$). The +# `y_std` plot looks like the `f_std` one, but shifted by a constant. But +# this is not the case because we compare standard deviations and not +# variances, hence `y_std` - `f_std` is not constant, and in particular +# $\neq \sigma_n$, but both are in the same numerical range (0.15 vs. +# 0.2). # * out-of-distribution: `f_std` (epistemic) dominates # Exercises
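
Aside (not part of the patch above): the notebooks obtain the posterior predictive distribution through GPyTorch. As a cross-check of the equations quoted in the markdown cells -- the epistemic covariance $\test{\ma\Sigma} = \testtest{\ma K} - \test{\ma K}\,(\ma K+\sigma_n^2\,\ma I)^{-1}\,\test{\ma K}^\top$, the total variance $\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N)$, and the "learned noise" identity $\sigma_n = \sqrt{\diag(\test{\ma\Sigma} + \sigma_n^2\,\ma I_N) - \diag(\test{\ma\Sigma})}$ -- here is a minimal from-scratch sketch in plain PyTorch. The RBF kernel mirrors the notebooks' `ScaleKernel(RBFKernel())`, but the fixed hyper param values, the constant mean set to the data mean, and the helper name `rbf_kernel` are illustrative assumptions of this sketch, not fitted values or course code.

import math
import torch

def rbf_kernel(xa, xb, lengthscale=1.0, outputscale=1.0):
    # k(x, x') = s * exp(-(x - x')^2 / (2 * ell^2)) for 1D inputs
    d2 = (xa[:, None] - xb[None, :]) ** 2
    return outputscale * torch.exp(-0.5 * d2 / lengthscale**2)

torch.manual_seed(123)
const, noise_std = 5.0, 0.1
X_train = torch.linspace(0, 4 * math.pi, 40)
y_train = (
    torch.sin(X_train) * torch.exp(-0.2 * X_train)
    + const
    + noise_std * torch.randn(X_train.shape)
)
X_pred = torch.linspace(-2, 4 * math.pi + 2, 200)

sigma_n2 = noise_std**2                     # likelihood variance sigma_n^2 (assumed known here)
c = y_train.mean()                          # constant mean m(x) = c, fixed at the data mean

K = rbf_kernel(X_train, X_train)            # K   (train, train)
K_s = rbf_kernel(X_pred, X_train)           # K_* (test, train)
K_ss = rbf_kernel(X_pred, X_pred)           # K_** (test, test)
A = K + sigma_n2 * torch.eye(len(X_train))  # K + sigma_n^2 I

# Standard GP regression posterior with constant prior mean c
mu_star = c + K_s @ torch.linalg.solve(A, y_train - c)          # posterior mean
Sigma_star = K_ss - K_s @ torch.linalg.solve(A, K_s.T)          # epistemic covariance Sigma_*
f_std = Sigma_star.diagonal().clamp_min(0).sqrt()               # epistemic std
y_std = (Sigma_star.diagonal() + sigma_n2).clamp_min(0).sqrt()  # total std

# Learned-noise identity checked in the notebooks: sqrt(y_std^2 - f_std^2) = sigma_n
print((y_std**2 - f_std**2).mean().sqrt())  # ~0.1 = noise_std

In the notebooks, `f_std` and `y_std` correspond to `post_pred_f.stddev` and `post_pred_y.stddev`, which GPyTorch computes internally from the same equations.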