diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py
index 1481a5dde2c7fcf65f32220e0b563cd759b3f5f0..388947a2020d9484e6e87d03e6c33b4f4a62b340 100644
--- a/BLcourse2.3/02_two_dim.py
+++ b/BLcourse2.3/02_two_dim.py
@@ -13,6 +13,8 @@
 # name: python3
 # ---
 
+# In this notebook, we use a GP to fit a 2D data set.
+
 # +
 # ##%matplotlib notebook
 # %matplotlib widget
@@ -112,6 +114,8 @@ X_pred = data_pred.X
 
 # # Exercise 1
 
+# Keep the settings below and explore the notebook till the end first.
+
 # +
 use_noise = False
 use_gap = False
@@ -119,15 +123,19 @@ use_gap = False
 
 # # Exercise 2
 
+# First complete the notebook as is, then come back here.
+
 # +
-##use_noise = True
-##use_gap = False
+##use_noise = False
+##use_gap = True
 # -
 
 # # Exercise 3
 
+# First complete the notebook with Exercise 2, then come back here.
+
 # +
-##use_noise = False
+##use_noise = True
 ##use_gap = True
 # -
 
@@ -146,7 +154,8 @@ else:
 # -
 
 # +
-# Cut out part of the train data to create out-of-distribution predictions
+# Cut out part of the train data to create out-of-distribution predictions.
+# Same as the "gaps" we created in the 1D case.
 
 if use_gap:
     mask = (X_train[:, 0] > 0) & (X_train[:, 1] < 0)
@@ -227,10 +236,10 @@ pprint(extract_model_params(model, raw=False))
 model.train()
 likelihood.train()
 
-optimizer = torch.optim.Adam(model.parameters(), lr=0.2)
+optimizer = torch.optim.Adam(model.parameters(), lr=0.15)
 loss_func = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
 
-n_iter = 300
+n_iter = 400
 history = defaultdict(list)
 for ii in range(n_iter):
     optimizer.zero_grad()
@@ -286,11 +295,14 @@ with torch.no_grad():
     assert (post_pred_f.mean == post_pred_y.mean).all()
 # -
 
+# When `use_noise=False`, the GP's prediction is an almost perfect
+# reconstruction of the ground truth function (in-distribution, so where we
+# have data).
 
 # # Plot difference to ground truth and uncertainty
 
 # +
-ncols = 3
+ncols = 4
 fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 7, 5))
 
 vmax = post_pred_y.stddev.max()
@@ -307,27 +319,42 @@ cs.append(
 )
 axs[0].set_title("|y_pred - y_true|")
 
+f_std = post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny))
+y_std = post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny))
+
 cs.append(
     axs[1].contourf(
         data_pred.XG,
         data_pred.YG,
-        post_pred_f.stddev.reshape((data_pred.nx, data_pred.ny)),
+        f_std,
         vmin=0,
         vmax=vmax,
     )
 )
-axs[1].set_title("f_std (epistemic)")
+axs[1].set_title("epistemic: f_std")
 
 cs.append(
     axs[2].contourf(
         data_pred.XG,
         data_pred.YG,
-        post_pred_y.stddev.reshape((data_pred.nx, data_pred.ny)),
+        y_std,
         vmin=0,
         vmax=vmax,
     )
 )
-axs[2].set_title("y_std (epistemic + aleatoric)")
+axs[2].set_title("total: y_std")
+
+cs.append(
+    axs[3].contourf(
+        data_pred.XG,
+        data_pred.YG,
+        y_std - f_std,
+        vmin=0,
+        cmap="plasma",
+        ##vmax=vmax,
+    )
+)
+axs[3].set_title("aleatoric: y_std - f_std")
 
 for ax, c in zip(axs, cs):
     ax.set_xlabel("X_0")
@@ -336,8 +363,7 @@ for ax, c in zip(axs, cs):
     fig.colorbar(c, ax=ax)
 # -
 
-
-# # Let's check the learned noise
+# ## Let's check the learned noise
 
 # +
 # Target noise to learn
@@ -356,7 +382,50 @@ print(
 )
 # -
 
-# # Plot confidence bands
+# # Observations
+
+# We have the following terms:
+#
+# * epistemic: `f_std` = $\sqrt{\mathrm{diag}\,\ma\Sigma}$
+# * total: `y_std` = $\sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma I_N)}$
+# * aleatoric: we have two ways of representing it
+#   * from the likelihood: $\sigma_n$
+#   * for plotting: we use `y_std` - `f_std`, which is $\neq \sigma_n$ because of the $\sqrt{\cdot}$
+#     above
+#
+# We can make the following observations:
+#
+# * Exercise 1: `use_noise=False`, `use_gap=False`
+#   * The epistemic uncertainty `f_std` is a good indicator
+#     of the (small) differences between model prediction and ground truth
+#   * The learned variance $\sigma_n^2$, and hence the aleatoric uncertainty, is near zero, which makes sense for noise-free data
+# * Exercise 2: `use_noise=False`, `use_gap=True`
+#   * When faced with out-of-distribution (OOD) data, the epistemic `f_std` clearly shows where the
+#     model will make wrong (less trustworthy) predictions
+# * Exercise 3: `use_noise=True`, `use_gap=True`
+#   * in-distribution (where we have data)
+#     * The distinction between
+#       epistemic and aleatoric, in the way we define it, is less meaningful, hence `f_std` doesn't
+#       correlate well with `y_pred - y_true`. The reason is that the noise
+#       $\sigma_n$ shows up in two places: (a) in the equation for $\ma\Sigma$
+#       itself, so the "epistemic" uncertainty `f_std` = $\sqrt{\mathrm{diag}\,\ma\Sigma}$
+#       is bigger just because we have noise (regression), and (b) we add
+#       it in $\sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma I_N)}$
+#       to get the total `y_std`
+#     * We learn the value of `noise_std` ($\sigma_n$) quite well and add
+#       **its square** as a constant ($\ma\Sigma + \sigma_n^2\,\ma I_N$), so one
+#       might expect the `y_std` plot to look like the `f_std` one shifted by
+#       a constant. This is not the case because we compare standard deviations
+#       and not variances, hence `y_std` - `f_std` is not constant, and in
+#       particular $\neq \sigma_n$, though both are in the same numerical range (0.15 vs. 0.2).
+#   * out-of-distribution: `f_std` (epistemic) dominates
+
+# Exercises
+#
+# Go back up, switch on the settings for Exercise 2 and re-run the notebook.
+# Same with Exercise 3.
+
+# # Bonus: plot confidence bands
 
 # +
 y_mean = post_pred_y.mean.reshape((data_pred.nx, data_pred.ny))
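A quick numerical check of the observations above: `y_std` - `f_std` is not equal to $\sigma_n$ even though the corresponding *variances* differ by exactly $\sigma_n^2$, because the square root does not distribute over sums. The snippet below is a minimal standalone sketch, not part of the patched notebook; the `f_var` values and the noise level 0.2 are illustrative assumptions only.

```python
# Standalone sketch: why y_std - f_std != sigma_n even though the variances
# differ by exactly sigma_n**2. Values below are made up for illustration.
import numpy as np

sigma_n = 0.2                               # noise std (illustrative)
f_var = np.array([0.001, 0.01, 0.05, 0.1])  # hypothetical diag(Sigma) values

f_std = np.sqrt(f_var)                      # "epistemic"
y_std = np.sqrt(f_var + sigma_n**2)         # "total"

print(y_std - f_std)                 # varies with f_var, each entry < sigma_n
print(np.sqrt(y_std**2 - f_std**2))  # recovers sigma_n = 0.2 exactly
```

As the epistemic variance shrinks, `y_std` - `f_std` approaches $\sigma_n$ from below, which is consistent with the 0.15 vs. 0.2 comparison quoted above.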