    # ---
    # jupyter:
    #   jupytext:
    #     formats: ipynb,py:light
    #     text_representation:
    #       extension: .py
    #       format_name: light
    #       format_version: '1.5'
    #       jupytext_version: 1.17.1
    #   kernelspec:
    #     display_name: bayes-ml-course
    #     language: python
    #     name: bayes-ml-course
    # ---
    
    # # About
    #
    # In this notebook, we replace ExactGP inference and log marginal likelihood
    # optimization with sparse stochastic variational inference. This serves as an
    # example of the many methods `gpytorch` offers for scaling GPs to large data
    # sets.
    # $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
    # $\newcommand{\ma}[1]{\mathbf{#1}}$
    # $\newcommand{\pred}[1]{\rm{#1}}$
    # $\newcommand{\predve}[1]{\mathbf{#1}}$
    # $\newcommand{\test}[1]{#1_*}$
    # $\newcommand{\testtest}[1]{#1_{**}}$
    # $\DeclareMathOperator{\diag}{diag}$
    # $\DeclareMathOperator{\cov}{cov}$
    
    # # Imports, helpers, setup
    
    # ##%matplotlib notebook
    # %matplotlib widget
    # ##%matplotlib inline
    
    # +
    import math
    from collections import defaultdict
    from pprint import pprint
    
    import torch
    import gpytorch
    from matplotlib import pyplot as plt
    from matplotlib import is_interactive
    import numpy as np
    from torch.utils.data import TensorDataset, DataLoader
    
    from utils import extract_model_params, plot_samples
    
    
    torch.set_default_dtype(torch.float64)
    torch.manual_seed(123)
    # -
    
    # # Generate toy 1D data
    #
    # Now we generate 10x more points than in the ExactGP case, yet inference
    # won't be much slower (exact GPs scale roughly as $N^3$). Note that the data
    # we use here is still tiny (1000 points is easy even for exact GPs), so the
    # method's usefulness cannot be fully exploited in our example
    # -- also, we don't even use a GPU yet :).
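    
    # As a rough illustration of the scaling argument above (a sketch, not part of
    # the GP code): the $N^3$ cost of exact GPs comes from factorizing an
    # $N\times N$ kernel matrix, whereas, roughly speaking, the sparse
    # approximation used below only factorizes matrices of the size of a much
    # smaller set of inducing points.
    
    # +
    import time
    
    for n in (100, 1000):
        A = torch.rand(n, n)
        A = A @ A.T + n * torch.eye(n)  # random symmetric positive definite matrix
        t0 = time.perf_counter()
        torch.linalg.cholesky(A)
        print(f"cholesky of a {n}x{n} matrix: {time.perf_counter() - t0:.2e} s")
    # -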
    
    
    # +
    def ground_truth(x, const):
        return torch.sin(x) * torch.exp(-0.2 * x) + const
    
    
    def generate_data(x, gaps=[[1, 3]], const=None, noise_std=None):
        noise_dist = torch.distributions.Normal(loc=0, scale=noise_std)
        y = ground_truth(x, const=const) + noise_dist.sample(
            sample_shape=(len(x),)
        )
        msk = torch.tensor([True] * len(x))
        if gaps is not None:
            for g in gaps:
                msk = msk & ~((x > g[0]) & (x < g[1]))
        return x[msk], y[msk], y
    
    
    const = 5.0
    noise_std = 0.1
    x = torch.linspace(0, 4 * math.pi, 1000)
    X_train, y_train, y_gt_train = generate_data(
        x, gaps=[[6, 10]], const=const, noise_std=noise_std
    )
    X_pred = torch.linspace(
        X_train[0] - 2, X_train[-1] + 2, 200, requires_grad=False
    )
    y_gt_pred = ground_truth(X_pred, const=const)
    
    print(f"{X_train.shape=}")
    print(f"{y_train.shape=}")
    print(f"{X_pred.shape=}")
    
    fig, ax = plt.subplots()
    ax.scatter(X_train, y_train, marker="o", color="tab:blue", label="noisy data")
    ax.plot(X_pred, y_gt_pred, ls="--", color="k", label="ground truth")
    ax.legend()
    # -
    
    # # Define GP model
    #
    # The model follows [this
    # example](https://docs.gpytorch.ai/en/stable/examples/04_Variational_and_Approximate_GPs/SVGP_Regression_CUDA.html)
    # based on [Hensman et al., "Scalable Variational Gaussian Process Classification",
    # 2015](https://proceedings.mlr.press/v38/hensman15.html). The model is
    # "sparse" since it works with a set of *inducing* points $(\ma Z, \ve u),
    # \ve u=f(\ma Z)$ which is much smaller than the train data $(\ma X, \ve y)$.
    #
    # We have the same hyper parameters as before
    #
    # * $\ell$ = `model.covar_module.base_kernel.lengthscale`
    # * $\sigma_n^2$ = `likelihood.noise_covar.noise`
    # * $s$ = `model.covar_module.outputscale`
    # * $m(\ve x) = c$ = `model.mean_module.constant`
    #
    # plus additional ones, introduced by the approximations used:
    #
    # * the learnable inducing points $\ma Z$ for the variational distribution
    #   $q_{\ve\psi}(\ve u)$
    # * learnable parameters of the variational distribution
    #   $q_{\ve\psi}(\ve u)=\mathcal N(\ve m_u, \ma S)$: the variational mean
    #   $\ve m_u$ and the covariance $\ma S$, parameterized by a lower triangular
    #   matrix $\ma L$ such that $\ma S=\ma L\,\ma L^\top$
    
    
    # +
    class ApproxGPModel(gpytorch.models.ApproximateGP):
        def __init__(self, Z):
            # Approximate inducing value posterior q(u), u = f(Z), Z = inducing
            # points (subset of X_train)
            variational_distribution = (
                gpytorch.variational.CholeskyVariationalDistribution(Z.size(0))
            )
            # Compute q(f(X)) from q(u)
            variational_strategy = gpytorch.variational.VariationalStrategy(
                self,
                Z,
                variational_distribution,
                learn_inducing_locations=True,
            )
            super().__init__(variational_strategy)
            self.mean_module = gpytorch.means.ConstantMean()
            self.covar_module = gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.RBFKernel()
            )
    
        def forward(self, x):
            mean_x = self.mean_module(x)
            covar_x = self.covar_module(x)
            return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
    
    
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    
    n_train = len(X_train)
    # Start values for the inducing points Z: use a random 10% sub-sample of X_train.
    ind_points_fraction = 0.1
    ind_idxs = torch.randperm(n_train)[: int(n_train * ind_points_fraction)]
    model = ApproxGPModel(Z=X_train[ind_idxs])
    # -
    
    
    # Inspect the model
    print(model)
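    
    # The sparse approximation adds learnable variational parameters on top of the
    # usual GP hyper params. A minimal sketch to list their names and shapes,
    # assuming gpytorch stores them as regular torch.nn.Module parameters of the
    # model's VariationalStrategy (inducing points, variational mean, Cholesky
    # factor of the variational covariance S):
    for p_name, p_val in model.variational_strategy.named_parameters():
        print(p_name, tuple(p_val.shape))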
    
    # Inspect the likelihood. In contrast to ExactGP, the likelihood is not part of
    # the GP model instance.
    print(likelihood)
    
    # Default start hyper params
    print("model params:")
    pprint(extract_model_params(model))
    print("likelihood params:")
    pprint(extract_model_params(likelihood))
    
    # Set new start hyper params
    model.mean_module.constant = 3.0
    model.covar_module.base_kernel.lengthscale = 1.0
    model.covar_module.outputscale = 1.0
    likelihood.noise_covar.noise = 0.3
    
    # # Fit GP to data: optimize hyper params
    #
    # Now we optimize the GP hyper parameters using GP-specific variational
    # inference (VI), where instead of the log marginal likelihood (as in the
    # ExactGP case) we maximize an ELBO ("evidence lower bound") objective
    #
    # $$
    # \max_{\ve\zeta}\left(\mathbb E_{q_{\ve\psi}(\ve u)}\left[\ln p(\ve y|\ve u) \right] -
    # D_{\text{KL}}(q_{\ve\psi}(\ve u)\Vert p(\ve u))\right)
    # $$
    #
    # with respect to
    #
    # $$\ve\zeta = [\ell, \sigma_n^2, s, c, \ve\psi] $$
    #
    # with
    #
    # $$\ve\psi = [\ve m_u, \ma Z, \ma L]$$
    #
    # the parameters of the variational distribution $q_{\ve\psi}(\ve u)$.
    #
    # In addition, we perform stochastic optimization using a deep-learning-style
    # mini-batch loop, hence "stochastic" variational inference (SVI). This speeds
    # up the optimization since we only look at a fraction of the data per
    # optimizer step to calculate an approximate loss gradient (`loss.backward()`).
    
    # +
    # Train mode
    model.train()
    likelihood.train()
    
    optimizer = torch.optim.Adam(
        [dict(params=model.parameters()), dict(params=likelihood.parameters())],
        lr=0.1,
    )
    loss_func = gpytorch.mlls.VariationalELBO(
        likelihood, model, num_data=X_train.shape[0]
    )
    
    train_dl = DataLoader(
        TensorDataset(X_train, y_train), batch_size=128, shuffle=True
    )
    
    n_iter = 200
    history = defaultdict(list)
    for i_iter in range(n_iter):
        # Collect per-batch values here and average them over the epoch below.
        batch_history = defaultdict(list)
        for i_batch, (X_batch, y_batch) in enumerate(train_dl):
            optimizer.zero_grad()
            loss = -loss_func(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            param_dct = dict()
            param_dct.update(extract_model_params(model, try_item=True))
            param_dct.update(extract_model_params(likelihood, try_item=True))
            for p_name, p_val in param_dct.items():
                if isinstance(p_val, float):
                    batch_history[p_name].append(p_val)
            batch_history["loss"].append(loss.item())
        for p_name, p_lst in batch_history.items():
            history[p_name].append(np.mean(p_lst))
        if (i_iter + 1) % 10 == 0:
            print(f"iter {i_iter + 1}/{n_iter}, {loss=:.3f}")
    # -
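    
    # Since each optimizer step only sees one mini-batch, the printed loss is a
    # noisy estimate of the full objective. As an optional sanity check (a sketch
    # using the objects defined above), evaluate the loss once on the full
    # training set after training:
    with torch.no_grad():
        full_loss = -loss_func(model(X_train), y_train)
    print(f"full-data loss (negative ELBO): {full_loss.item():.3f}")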
    
    # Plot convergence of the scalar hyper params and of the loss (negative ELBO)
    ncols = len(history)
    fig, axs = plt.subplots(
        ncols=ncols, nrows=1, figsize=(ncols * 3, 3), layout="compressed"
    )
    with torch.no_grad():
        for ax, (p_name, p_lst) in zip(axs, history.items()):
            ax.plot(p_lst)
            ax.set_title(p_name)
            ax.set_xlabel("iterations")
    
    # Values of optimized hyper params
    print("model params:")
    pprint(extract_model_params(model))
    print("likelihood params:")
    pprint(extract_model_params(likelihood))
    
    
    # # Run prediction
    
    # +
    # Evaluation (predictive posterior) mode
    model.eval()
    likelihood.eval()
    
    with torch.no_grad():
        M = 10
        post_pred_f = model(X_pred)
        post_pred_y = likelihood(model(X_pred))
    
        fig, axs = plt.subplots(ncols=2, figsize=(14, 5), sharex=True, sharey=True)
        fig_sigmas, ax_sigmas = plt.subplots()
        for ii, (ax, post_pred, name, title) in enumerate(
            zip(
                axs,
                [post_pred_f, post_pred_y],
                ["f", "y"],
                ["epistemic uncertainty", "total uncertainty"],
            )
        ):
            yf_mean = post_pred.mean
            yf_samples = post_pred.sample(sample_shape=torch.Size((M,)))
    
            yf_std = post_pred.stddev
            lower = yf_mean - 2 * yf_std
            upper = yf_mean + 2 * yf_std
            ax.plot(
                X_train.numpy(),
                y_train.numpy(),
                "o",
                label="data",
                color="tab:blue",
            )
            ax.plot(
                X_pred.numpy(),
                yf_mean.numpy(),
                label="mean",
                color="tab:red",
                lw=2,
            )
            ax.plot(
                X_pred.numpy(),
                y_gt_pred.numpy(),
                label="ground truth",
                color="k",
                lw=2,
                ls="--",
            )
            ax.fill_between(
                X_pred.numpy(),
                lower.numpy(),
                upper.numpy(),
                label="confidence",
                color="tab:orange",
                alpha=0.3,
            )
            ax.set_title(f"confidence = {title}")
            if name == "f":
                sigma_label = r"epistemic: $\pm 2\sqrt{\mathrm{diag}(\Sigma_*)}$"
                zorder = 1
            else:
                sigma_label = (
                    r"total: $\pm 2\sqrt{\mathrm{diag}(\Sigma_* + \sigma_n^2\,I)}$"
                )
                zorder = 0
            ax_sigmas.fill_between(
                X_pred.numpy(),
                lower.numpy(),
                upper.numpy(),
                label=sigma_label,
                color="tab:orange" if name == "f" else "tab:blue",
                alpha=0.5,
                zorder=zorder,
            )
            y_min = y_train.min()
            y_max = y_train.max()
            y_span = y_max - y_min
            ax.set_ylim([y_min - 0.3 * y_span, y_max + 0.3 * y_span])
            plot_samples(ax, X_pred, yf_samples, label="posterior pred. samples")
            if ii == 1:
                ax.legend()
        ax_sigmas.set_title("total vs. epistemic uncertainty")
        ax_sigmas.legend()
    # -
    
    # # Let's check the learned noise
    
    # +
    # Target noise to learn
    print("data noise:", noise_std)
    
    # The two values below should match, since the total predictive variance is
    # the epistemic variance plus the learned noise variance.
    print(
        "learned noise:",
        (post_pred_y.stddev**2 - post_pred_f.stddev**2).mean().sqrt().item(),
    )
    print(
        "learned noise:",
        np.sqrt(
            extract_model_params(likelihood, try_item=True)["noise_covar.noise"]
        ),
    )
    # -
    
    # When running as a script
    if not is_interactive():
        plt.show()