diff --git a/BLcourse2.3/03_one_dim_SVI.ipynb b/BLcourse2.3/03_one_dim_SVI.ipynb
index 7cfaa301275ec7638ce72dd05df8683fc9d7d265..f60508eba28f67c8992c710c4d837f9ba1187282 100644
--- a/BLcourse2.3/03_one_dim_SVI.ipynb
+++ b/BLcourse2.3/03_one_dim_SVI.ipynb
@@ -17,7 +17,8 @@
     "$\\newcommand{\\predve}[1]{\\mathbf{#1}}$\n",
     "$\\newcommand{\\test}[1]{#1_*}$\n",
     "$\\newcommand{\\testtest}[1]{#1_{**}}$\n",
-    "$\\newcommand{\\dd}{\\rm{d}}$\n",
+    "$\\newcommand{\\dd}{{\\rm{d}}}$\n",
+    "$\\newcommand{\\lt}[1]{_{\\text{#1}}}$\n",
     "$\\DeclareMathOperator{\\diag}{diag}$\n",
     "$\\DeclareMathOperator{\\cov}{cov}$"
    ]
@@ -264,14 +265,46 @@
    "source": [
     "# Fit GP to data: optimize hyper params\n",
     "\n",
-    "Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),\n",
-    "where we optimize not the log marginal likelihood (ExactGP case),\n",
-    "but an ELBO (evidence lower bound) objective. The latter is a proxy for minimizing\n",
-    "the KL divergence between distributions, which in our case are the approximate\n",
+    "Now we optimize the GP hyper parameters by doing a GP-specific variational\n",
+    "inference (VI), where we don't maximize the log marginal likelihood (ExactGP\n",
+    "case), but an ELBO (\"evidence lower bound\") objective -- a lower bound on the\n",
+    "marginal likelihood (the \"evidence\"). In variational inference, an ELBO objective\n",
+    "shows up when minimizing the KL divergence between\n",
+    "an approximate and the true posterior\n",
+    "\n",
+    "$$\n",
+    "    p(w|y) = \\frac{p(y|w)\\,p(w)}{\\int p(y|w)\\,p(w)\\,\\dd w}\n",
+    "           = \\frac{p(y|w)\\,p(w)}{p(y)}\n",
+    "$$\n",
+    "\n",
+    "$$\n",
+    "  \\ve\\zeta^* = \\text{arg}\\min_{\\ve\\zeta} D\\lt{KL}(q_{\\ve\\zeta}(w)\\,\\Vert\\, p(w|y))\n",
+    "$$\n",
+    "\n",
+    "to obtain the optimal variational parameters $\\ve\\zeta^*$ to approximate\n",
+    "$p(w|y)$ with $q_{\\ve\\zeta^*}(w)$.\n",
+    "\n",
+    "In our case the two distributions are the approximate\n",
     "\n",
     "$$q_{\\ve\\zeta}(\\mathbf f)=\\int p(\\mathbf f|\\ve u)\\,q_{\\ve\\psi}(\\ve u)\\,\\dd\\ve u\\quad(\\text{\"variational strategy\"})$$\n",
     "\n",
-    "and the true $p(\\mathbf f|\\mathcal D)$ posterior over function values. We optimize with respect to\n",
+    "and the true $p(\\mathbf f|\\mathcal D)$ posterior over function values. We\n",
+    "optimize with respect to\n",
     "\n",
     "$$\\ve\\zeta = [\\ell, \\sigma_n^2, s, c, \\ve\\psi] $$\n",
     "\n",
diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py
index 27cb1fe3382be05ab552ec4fb32ff03808d4e53f..8f34d40034dbda1a5a20c03fe4b1e55608d26e81 100644
--- a/BLcourse2.3/03_one_dim_SVI.py
+++ b/BLcourse2.3/03_one_dim_SVI.py
@@ -25,7 +25,8 @@
 # $\newcommand{\predve}[1]{\mathbf{#1}}$
 # $\newcommand{\test}[1]{#1_*}$
 # $\newcommand{\testtest}[1]{#1_{**}}$
-# $\newcommand{\dd}{\rm{d}}$
+# $\newcommand{\dd}{{\rm{d}}}$
+# $\newcommand{\lt}[1]{_{\text{#1}}}$
 # $\DeclareMathOperator{\diag}{diag}$
 # $\DeclareMathOperator{\cov}{cov}$
 
@@ -187,14 +188,46 @@ likelihood.noise_covar.noise = 0.3
 
 # # Fit GP to data: optimize hyper params
 #
-# Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),
-# where we optimize not the log marginal likelihood (ExactGP case),
-# but an ELBO (evidence lower bound) objective. The latter is a proxy for minimizing
-# the KL divergence between distributions, which in our case are the approximate
+# Now we optimize the GP hyper parameters by doing GP-specific variational
+# inference (VI). Instead of maximizing the log marginal likelihood (as in the
+# ExactGP case), we maximize an ELBO ("evidence lower bound") objective, which
+# is a lower bound on the marginal likelihood (the "evidence"). In variational
+# inference, an ELBO objective arises when minimizing the KL divergence between
+# an approximate posterior and the true posterior
+#
+# $$
+#     p(w|y) = \frac{p(y|w)\,p(w)}{\int p(y|w)\,p(w)\,\dd w}
+#            = \frac{p(y|w)\,p(w)}{p(y)}
+# $$
+#
+# by solving
+#
+# $$
+#   \ve\zeta^* = \text{arg}\min_{\ve\zeta} D\lt{KL}(q_{\ve\zeta}(w)\,\Vert\, p(w|y))
+# $$
+#
+# to obtain the optimal variational parameters $\ve\zeta^*$ and approximate
+# $p(w|y)$ by $q_{\ve\zeta^*}(w)$.
+#
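+# To see why this yields an ELBO, write down the standard decomposition of the
+# log evidence (we sketch the well-known identity here; $w$ stands for all
+# latent quantities being inferred)
+#
+# $$
+#   \ln p(y) = \underbrace{\int q_{\ve\zeta}(w)\,\ln\frac{p(y|w)\,p(w)}{q_{\ve\zeta}(w)}\,\dd w}_{\text{ELBO}}
+#            + D\lt{KL}(q_{\ve\zeta}(w)\,\Vert\, p(w|y)).
+# $$
+#
+# Since $\ln p(y)$ is constant with respect to $\ve\zeta$, maximizing the ELBO
+# minimizes the KL divergence, without ever having to evaluate the intractable
+# true posterior $p(w|y)$.
+#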
+# In our case the two distributions are the approximate
 #
 # $$q_{\ve\zeta}(\mathbf f)=\int p(\mathbf f|\ve u)\,q_{\ve\psi}(\ve u)\,\dd\ve u\quad(\text{"variational strategy"})$$
 #
-# and the true $p(\mathbf f|\mathcal D)$ posterior over function values. We optimize with respect to
+# and the true $p(\mathbf f|\mathcal D)$ posterior over function values. We
+# optimize with respect to
 #
 # $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$
 #