diff --git a/BLcourse2.3/02_two_dim.py b/BLcourse2.3/02_two_dim.py
index 0ff95c63f58835f8672e775b5abce7894960cf78..6c59344978518666d3d3b2bcd11fcea1b4a5b0d3 100644
--- a/BLcourse2.3/02_two_dim.py
+++ b/BLcourse2.3/02_two_dim.py
@@ -196,6 +196,9 @@ s1 = ax.scatter(
 ax.set_xlabel("X_0")
 ax.set_ylabel("X_1")
 
+# The gray surface is the ground truth function. The blue points are the
+# training data.
+
 # # Define GP model
 
 
@@ -315,7 +318,39 @@ assert (post_pred_f.mean == post_pred_y.mean).all()
 
 # When `use_noise=False`, then the GP's prediction is an almost perfect
 # reconstruction of the ground truth function (in-distribution, so where we
-# have data). While 3D plots are fun, they are not optimal for judging how well
+# have data).
+# In this case, the plot makes the GP prediction look like a perfect
+# *interpolation* of the noise-free data, so $\test{\ve\mu} = \ve y$ at the
+# train points $\test{\ma X} = \ma X$. This
+# would be true if our GP model had exactly zero noise, so the likelihood's
+# $\sigma_n^2$ would be zero. However, `print(model)`
+#
+# ```
+# ExactGPModel(
+#  (likelihood): GaussianLikelihood(
+#    (noise_covar): HomoskedasticNoise(
+#      (raw_noise_constraint): GreaterThan(1.000E-04)
+#    )
+#  )
+#  ...
+# ```
+#
+# shows that actually the min value is $10^{-4}$, so we technically always have
+# a regression setting, just with very small noise. The reason is that in the
+# GP equations, we have
+#
+# $$\test{\ve\mu} = \test{\ma K}\,\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y$$
+#
+# where $\sigma_n^2$ acts as a *regularization* parameter (also called "jitter
+# term" sometimes), which improves the
+# numerical stability of the linear system solve step
+#
+# $$\left(\ma K+\sigma_n^2\,\ma I_N\right)^{-1}\,\ve y\:.$$
+#
+# Also, we always keep $\sigma_n^2$ as a hyper parameter that we learn, and the
+# smallest value the hyper parameter optimization can reach is $10^{-4}$.
+#
+# While 3D plots are fun, they are not optimal for judging how well
 # the GP model represents the ground truth function.
 
 # # Plot difference to ground truth and uncertainty
@@ -433,20 +468,17 @@ print(
 # * Exercise 3: `use_noise=True`, `use_gap=True`
 #   * in-distribution (where we have data)
 #     * The distinction between
-#       epistemic and aleatoric in the way we define it is less meaningful,
+#       epistemic and aleatoric uncertainty in the way we define it is less meaningful,
 #       hence, `f_std` doesn't correlate well with `y_pred - y_true`. The
 #       reason is that the noise $\sigma_n$ shows up in two parts: (a) in the
 #       equation of $\test{\ma\Sigma}$ itself, so the "epistemic" uncertainty
 #       `f_std` = $\sqrt{\diag\test{\ma\Sigma}}$ is bigger just because we have
 #       noise (regression) and (b) we add it in $\sqrt{\diag(\test{\ma\Sigma} +
 #       \sigma_n^2\,\ma I_N)}$ to get the total `y_std`
-#     * We learn the value of `noise_std` ($\sigma_n$) quite well and add **its
-#       square** as a constant ($\test{\ma\Sigma} + \sigma_n^2\,\ma I_N$). The
-#       `y_std` plot looks like the `f_std` one, but shifted by a constant. But
-#       this is not the case because we compare standard deviations and not
+#     * The `y_std` plot may look like the `f_std` one shifted by a constant,
+#       but this is not the case because we compare standard deviations and not
 #       variances, hence `y_std` - `f_std` is not constant, and in particular
-#       $\neq \sigma_n$, but both are in the same numerical range (0.15 vs.
-#       0.2).
+#       $\neq \sigma_n$, but both are in the same numerical range (0.15 vs. 0.2).
 #   * out-of-distribution: `f_std` (epistemic) dominates
 
 # # Exercises
diff --git a/BLcourse2.3/03_one_dim_SVI.py b/BLcourse2.3/03_one_dim_SVI.py
index 7eb6171efd41c7872df50d31aae3ce71d1215892..644e61a37b818301dd4604bb25d74b64cb5be511 100644
--- a/BLcourse2.3/03_one_dim_SVI.py
+++ b/BLcourse2.3/03_one_dim_SVI.py
@@ -175,7 +175,7 @@ pprint(extract_model_params(model))
 print("likelihood params:")
 pprint(extract_model_params(likelihood))
 
-# Set new start hyper params
+# Set new start hyper params (scalars only)
 model.mean_module.constant = 3.0
 model.covar_module.base_kernel.lengthscale = 1.0
 model.covar_module.outputscale = 1.0