From 6e4ff1eefdecf31c4dc82658565aa36cae6fa6fe Mon Sep 17 00:00:00 2001
From: Steve Schmerler <git@elcorto.com>
Date: Wed, 21 May 2025 17:19:01 +0200
Subject: [PATCH] Update 03_one_dim_SVI

---
 BLcourse2.3/03_one_dim_SVI.ipynb | 14 ++++++++------
 BLcourse2.3/03_one_dim_SVI.py    | 14 ++++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/BLcourse2.3/03_one_dim_SVI.ipynb b/BLcourse2.3/03_one_dim_SVI.ipynb
index 0520fa0..7cfaa30 100644
--- a/BLcourse2.3/03_one_dim_SVI.ipynb
+++ b/BLcourse2.3/03_one_dim_SVI.ipynb
@@ -17,6 +17,7 @@
     "$\newcommand{\predve}[1]{\mathbf{#1}}$\n",
     "$\newcommand{\test}[1]{#1_*}$\n",
     "$\newcommand{\testtest}[1]{#1_{**}}$\n",
+    "$\newcommand{\dd}{\rm{d}}$\n",
     "$\DeclareMathOperator{\diag}{diag}$\n",
     "$\DeclareMathOperator{\cov}{cov}$"
    ]
@@ -141,6 +142,9 @@
     "2015](https://proceedings.mlr.press/v38/hensman15.html). The model is\n",
     "\"sparse\" since it works with a set of *inducing* points $(\ma Z, \ve u),\n",
     "\ve u=f(\ma Z)$ which is much smaller than the train data $(\ma X, \ve y)$.\n",
+    "See also [the GPJax\n",
+    "docs](https://docs.jaxgaussianprocesses.com/_examples/uncollapsed_vi) for a\n",
+    "nice introduction.\n",
     "\n",
     "We have the same hyper parameters as before\n",
     "\n",
@@ -262,14 +266,12 @@
     "\n",
     "Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),\n",
     "where we optimize not the log marginal likelihood (ExactGP case),\n",
-    "but an ELBO (evidence lower bound) objective\n",
+    "but an ELBO (evidence lower bound) objective. The latter is a proxy for minimizing\n",
+    "the KL divergence between distributions, which in our case are the approximate\n",
     "\n",
-    "$$\n",
-    "\max_\ve\zeta\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] -\n",
-    "D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right)\n",
-    "$$\n",
+    "$$q_{\ve\zeta}(\mathbf f)=\int p(\mathbf f|\ve u)\,q_{\ve\psi}(\ve u)\,\dd\ve u\quad(\text{\"variational strategy\"})$$\n",
     "\n",
-    "with respect to\n",
+    "and the true $p(\mathbf f|\mathcal D)$ posterior over function values. We optimize with respect to\n",
     "\n",
     "$$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$\n",
     "\n",
diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py
index 644e61a..27cb1fe 100644
--- a/BLcourse2.3/03_one_dim_SVI.py
+++ b/BLcourse2.3/03_one_dim_SVI.py
@@ -25,6 +25,7 @@
 # $\newcommand{\predve}[1]{\mathbf{#1}}$
 # $\newcommand{\test}[1]{#1_*}$
 # $\newcommand{\testtest}[1]{#1_{**}}$
+# $\newcommand{\dd}{\rm{d}}$
 # $\DeclareMathOperator{\diag}{diag}$
 # $\DeclareMathOperator{\cov}{cov}$
@@ -108,6 +109,9 @@ ax.legend()
 # 2015](https://proceedings.mlr.press/v38/hensman15.html). The model is
 # "sparse" since it works with a set of *inducing* points $(\ma Z, \ve u),
 # \ve u=f(\ma Z)$ which is much smaller than the train data $(\ma X, \ve y)$.
+# See also [the GPJax
+# docs](https://docs.jaxgaussianprocesses.com/_examples/uncollapsed_vi) for a
+# nice introduction.
 #
 # We have the same hyper parameters as before
 #
@@ -185,14 +189,12 @@ likelihood.noise_covar.noise = 0.3
 #
 # Now we optimize the GP hyper parameters by doing a GP-specific variational inference (VI),
 # where we optimize not the log marginal likelihood (ExactGP case),
-# but an ELBO (evidence lower bound) objective
+# but an ELBO (evidence lower bound) objective. The latter is a proxy for minimizing
+# the KL divergence between distributions, which in our case are the approximate
 #
-# $$
-# \max_\ve\zeta\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] -
-# D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right)
-# $$
+# $$q_{\ve\zeta}(\mathbf f)=\int p(\mathbf f|\ve u)\,q_{\ve\psi}(\ve u)\,\dd\ve u\quad(\text{"variational strategy"})$$
 #
-# with respect to
+# and the true $p(\mathbf f|\mathcal D)$ posterior over function values. We optimize with respect to
 #
 # $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$
 #
--
GitLab
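For reference, a minimal GPyTorch sketch of the kind of sparse variational GP training the updated text refers to: an `ApproximateGP` with a variational strategy $q_{\ve\zeta}(\mathbf f)=\int p(\mathbf f|\ve u)\,q_{\ve\psi}(\ve u)\,\dd\ve u$, trained by maximizing the `VariationalELBO`. The toy data, inducing-point choice, constant mean, and RBF kernel below are illustrative assumptions, not necessarily the exact model used in `03_one_dim_SVI.py`.

```python
import torch
import gpytorch


class ApproxGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, Z):
        # q_psi(u): free-form Gaussian over the inducing values u = f(Z)
        variational_distribution = (
            gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))
        )
        # "Variational strategy": q(f) = \int p(f|u) q(u) du, with Z learned
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, Z, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel()
        )

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


# Hypothetical 1D toy data (placeholders, not the course data set)
X = torch.linspace(0, 1, 200).unsqueeze(-1)
y = torch.sin(2 * torch.pi * X.squeeze()) + 0.1 * torch.randn(200)

Z = X[::20].clone()  # a small set of inducing points, |Z| << N
model = ApproxGPModel(Z)
likelihood = gpytorch.likelihoods.GaussianLikelihood()

# ELBO objective: maximizing it minimizes KL(q_zeta(f) || p(f|D))
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=y.numel())
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(likelihood.parameters()), lr=0.1
)

model.train()
likelihood.train()
for _ in range(200):
    optimizer.zero_grad()
    loss = -mll(model(X), y)  # negative ELBO as the loss
    loss.backward()
    optimizer.step()
```

All of $\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi]$ (kernel, likelihood, mean and variational parameters, plus $\ma Z$ when `learn_inducing_locations=True`) are updated jointly by the same optimizer.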