diff --git a/BLcourse2.3/01_one_dim.py b/BLcourse2.3/01_one_dim.py
index ec49e804ad3ddf2e071646511d677be45047a883..4566f23b857aecd3fcb5707e104f7d3cf379c40b 100644
--- a/BLcourse2.3/01_one_dim.py
+++ b/BLcourse2.3/01_one_dim.py
@@ -214,11 +214,12 @@ with torch.no_grad():
 # -

-# Let's investigate the samples more closely. A constant mean $\ve m(\ma X) =
-# \ve c$ does *not* mean that each sampled vector $\predve f$'s mean is
-# equal to $c$. Instead, we have that at each $\ve x_i$, the mean of
-# *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M f_j(\ve x_i)
-# \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$.
+# Let's investigate the samples more closely. First we note that the samples
+# fluctuate around the mean `model.mean_module.constant` we defined above. A
+# constant mean $\ve m(\ma X) = \ve c$ does *not* mean that each sampled vector
+# $\predve f$'s mean is equal to $c$. Instead, we have that at each $\ve x_i$,
+# the mean of *all* sampled functions is the same, so $\frac{1}{M}\sum_{j=1}^M
+# f_j(\ve x_i) \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$.
 #
 # Look at the first 20 x points from M=10 samples
@@ -242,7 +243,7 @@ print(f"{f_samples.mean(axis=0).std()=}")
 #
 # We use the fixed hyper param values defined above. In particular, since
 # $\sigma_n^2$ = `model.likelihood.noise_covar.noise` is > 0, we have a
-# regression setting.
+# regression setting -- the GP's mean doesn't interpolate all points.

 # +
 # Evaluation (predictive posterior) mode
@@ -297,20 +298,23 @@ with torch.no_grad():
     ax.legend()
 # -

+# We observe that all sampled functions (green) and the mean (red) tend towards
+# the low fixed mean function $m(\ve x)=c$ at 3.0 in the absence of data, while
+# the actual data mean is `const` from above (data generation). Also the other
+# hyper params ($\ell$, $\sigma_n^2$, $s$) are just guesses. Now we will
+# calculate their actual value by minimizing the negative log marginal
+# likelihood.

 # # Fit GP to data: optimize hyper params
 #
 # In each step of the optimizer, we condition on the training data (e.g. do
 # Bayesian inference) to calculate the posterior predictive distribution for
-# the current values of the hyper params. We iterate until the log marginal
+# the current values of the hyper params. We iterate until the negative log marginal
 # likelihood is converged.
 #
 # We use a simplistic PyTorch-style hand written train loop without convergence
 # control, so make sure to use enough `n_iter` and eyeball-check that the loss
 # is converged :-)
-#
-# Observe how all hyper params converge. In particular, note that the constant
-# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`.

 # +
 # Train mode
@@ -334,7 +338,7 @@ for ii in range(n_iter):
     history["loss"].append(loss.item())
 # -

-# Plot hyper params and loss (neg. log marginal likelihood) convergence
+# Plot hyper params and loss (negative log marginal likelihood) convergence
 ncols = len(history)
 fig, axs = plt.subplots(ncols=ncols, nrows=1, figsize=(ncols * 5, 5))
 for ax, (p_name, p_lst) in zip(axs, history.items()):
@@ -345,6 +349,9 @@ for ax, (p_name, p_lst) in zip(axs, history.items()):
 # Values of optimized hyper params
 pprint(extract_model_params(model, raw=False))

+# We see that all hyper params converge. In particular, note that the constant
+# mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`.
+
 # # Run prediction
 #
 # We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs.
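
For reference, the "simplistic PyTorch-style hand written train loop" and the prediction step that the patched text refers to follow the standard GPyTorch pattern sketched below. This is not the code from 01_one_dim.py: the model class, the toy data, n_iter and the learning rate are illustrative assumptions; only the overall flow (minimize the negative log marginal likelihood with an optimizer, then switch to eval mode for the "noiseless" vs. "noisy" predictive posterior) matches what the notebook describes.

# Sketch only, assuming GPyTorch. Names such as ExactGPModel, X_train, y_train,
# n_iter and lr are assumptions for illustration, not the notebook's exact code.
import torch
import gpytorch


class ExactGPModel(gpytorch.models.ExactGP):
    """GP with constant mean and scaled RBF (squared exponential) kernel."""

    def __init__(self, X_train, y_train, likelihood):
        super().__init__(X_train, y_train, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


# Toy 1D data (stand-in for the notebook's generate_data())
torch.manual_seed(123)
X_train = torch.linspace(0.0, 5.0, 50)
y_train = torch.sin(X_train) + 3.0 + 0.2 * torch.randn(X_train.shape)

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(X_train, y_train, likelihood)

# Train mode: optimize hyper params (lengthscale, output scale, noise, mean const)
model.train()
likelihood.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

n_iter = 300
for ii in range(n_iter):
    optimizer.zero_grad()
    # loss = negative log marginal likelihood for the current hyper params
    loss = -mll(model(X_train), y_train)
    loss.backward()
    optimizer.step()

# Eval mode: predictive posterior on test points
model.eval()
likelihood.eval()
X_pred = torch.linspace(-2.0, 7.0, 200)
with torch.no_grad():
    post_pred_f = model(X_pred)              # "noiseless": posterior over f
    post_pred_y = likelihood(model(X_pred))  # "noisy": adds sigma_n^2 to the variance
    print(post_pred_f.mean.shape, post_pred_f.variance.shape)

In the actual notebook the loop additionally records the loss and each hyper param in the history dict (visible in the hunks above) and convergence is checked by eye via the loss plot, as the patched comments note.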