diff --git a/BLcourse2.3/01_simple_gp_regression.ipynb b/BLcourse2.3/01_simple_gp_regression.ipynb
index b6581706745e41359849d48b271ce9febbd37d58..826c08fab29a8ac94ae682dccf9749d9dc38c42c 100644
--- a/BLcourse2.3/01_simple_gp_regression.ipynb
+++ b/BLcourse2.3/01_simple_gp_regression.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "8bc26c14",
+   "id": "16bd8a69",
    "metadata": {},
    "source": [
     "# Notation\n",
@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3ef36e32",
+   "id": "87093c43",
    "metadata": {},
    "source": [
     "# Imports, helpers, setup"
@@ -32,7 +32,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c86d69b2",
+   "id": "69a0af7f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -105,7 +105,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9c543940",
+   "id": "eba6d895",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -122,7 +122,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0674cc5a",
+   "id": "24e8765e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -150,7 +150,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e4ddc32f",
+   "id": "9b980b1c",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -179,7 +179,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "93822ff7",
+   "id": "a0a0c12c",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -215,7 +215,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3288a7ca",
+   "id": "e2778668",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,7 +226,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9348a1db",
+   "id": "eacc9f2b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -237,7 +237,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "b2b5d326",
+   "id": "f90ce51c",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -254,7 +254,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "c7309ee3",
+   "id": "eb9ed01f",
    "metadata": {},
    "source": [
     "# Sample from the GP prior\n",
@@ -269,7 +269,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "56b26cd6",
+   "id": "894b7e36",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -305,7 +305,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6758e47f",
+   "id": "33bf94f7",
    "metadata": {},
    "source": [
     "Let's investigate the samples more closely. A constant mean $\\ve m(\\ma X) =\n",
@@ -318,7 +318,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "69ae529a",
+   "id": "ff0dcbaa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -332,7 +332,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "df46aa3f",
+   "id": "7018cb26",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -346,7 +346,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5a9350d8",
+   "id": "ea005dc2",
    "metadata": {},
    "source": [
     "# Fit GP to data: optimize hyper params\n",
@@ -366,7 +366,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "638d1244",
+   "id": "67b5961b",
    "metadata": {
     "lines_to_next_cell": 2
    },
@@ -403,7 +403,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "777b67d9",
+   "id": "5faec1a4",
    "metadata": {},
    "source": [
     "# Run prediction\n",
@@ -424,7 +424,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "740fb21a",
+   "id": "e56fd041",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -483,7 +483,7 @@
  ],
  "metadata": {
   "jupytext": {
-   "formats": "ipynb,py:percent"
+   "formats": "ipynb,py:light"
   },
   "kernelspec": {
    "display_name": "Python 3 (ipykernel)",
diff --git a/BLcourse2.3/01_simple_gp_regression.py b/BLcourse2.3/01_simple_gp_regression.py
index fcfcda544ec40c9be131cde8bb0937b0980f886c..a48f086fccda0a7586a468ac21598987296be2e8 100644
--- a/BLcourse2.3/01_simple_gp_regression.py
+++ b/BLcourse2.3/01_simple_gp_regression.py
@@ -1,19 +1,18 @@
 # ---
 # jupyter:
 #   jupytext:
-#     formats: ipynb,py:percent
+#     formats: ipynb,py:light
 #     text_representation:
 #       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.14.4
+#       format_name: light
+#       format_version: '1.5'
+#       jupytext_version: 1.16.2
 #   kernelspec:
 #     display_name: Python 3 (ipykernel)
 #     language: python
 #     name: python3
 # ---
 
-# %% [markdown]
 # # Notation
 # $\newcommand{\ve}[1]{\mathit{\boldsymbol{#1}}}$
 # $\newcommand{\ma}[1]{\mathbf{#1}}$
@@ -29,10 +28,9 @@
 # we still denote the collection of all $\ve x_i = x_i\in\mathbb R$ points with
 # $\ma X$ to keep the notation consistent with the slides.
 
-# %% [markdown]
 # # Imports, helpers, setup
 
-# %%
+# +
 # %matplotlib inline
 
 import math
@@ -98,8 +96,8 @@ def plot_samples(ax, X_pred, samples, label=None, **kwds):
 torch.set_default_dtype(torch.float64)
 
 torch.manual_seed(123)
+# -
 
-# %% [markdown]
 # # Generate toy 1D data
 #
 # Here we generate noisy 1D data `X_train`, `y_train` as well as an extended
@@ -109,7 +107,7 @@ torch.manual_seed(123)
 # show how the model uncertainty will behave there.
 
 
-# %%
+# +
 def generate_data(x, gaps=[[1, 3]], const=5):
     y = torch.sin(x) * torch.exp(-0.2 * x) + torch.randn(x.shape) * 0.1 + const
     msk = torch.tensor([True] * len(x))
@@ -130,8 +128,8 @@ print(f"{y_train.shape=}")
 print(f"{X_pred.shape=}")
 
 plt.scatter(X_train, y_train, marker="o", color="tab:blue")
+# -
 
-# %% [markdown]
 # # Define GP model
 #
 # We define the simplest possible textbook GP model using a Gaussian
@@ -153,7 +151,7 @@ plt.scatter(X_train, y_train, marker="o", color="tab:blue")
 # * $m(\ve x) = c$ = `model.mean_module.constant`
 
 
-# %%
+# +
 class ExactGPModel(gpytorch.models.ExactGP):
     """API:
 
@@ -179,17 +177,16 @@ class ExactGPModel(gpytorch.models.ExactGP):
 
 likelihood = gpytorch.likelihoods.GaussianLikelihood()
 model = ExactGPModel(X_train, y_train, likelihood)
+# -
 
 
-# %%
 # Inspect the model
 print(model)
 
-# %%
 # Default start hyper params
 pprint(extract_model_params(model, raw=False))
 
-# %%
+# +
 # Set new start hyper params
 model.mean_module.constant = 3.0
 model.covar_module.base_kernel.lengthscale = 1.0
@@ -197,9 +194,9 @@ model.covar_module.outputscale = 1.0
 model.likelihood.noise_covar.noise = 0.1
 
 pprint(extract_model_params(model, raw=False))
+# -
 
 
-# %% [markdown]
 # # Sample from the GP prior
 #
 # We sample a number of functions $f_j, j=1,\ldots,M$ from the GP prior and
@@ -208,7 +205,7 @@ pprint(extract_model_params(model, raw=False))
 # c, \ma K)$. Each sampled vector $\pred{\ve y}\in\mathbb R^{N'}$ and the
 # covariance (kernel) matrix is $\ma K\in\mathbb R^{N'\times N'}$.
 
-# %%
+# +
 model.eval()
 likelihood.eval()
 
@@ -235,9 +232,9 @@ with torch.no_grad():
         label="confidence",
     )
     ax.legend()
+# -
 
 
-# %% [markdown]
 # Let's investigate the samples more closely. A constant mean $\ve m(\ma X) =
 # \ve c$ does *not* mean that each sampled vector $\pred{\ve y}$'s mean is
 # equal to $c$. Instead, we have that at each $\ve x_i$, the mean of
@@ -245,14 +242,12 @@ with torch.no_grad():
 # \approx c$ and for $M\rightarrow\infty$ it will be exactly $c$.
 #
 
-# %%
 # Look at the first 20 x points from M=10 samples
 print(f"{f_samples.shape=}")
 print(f"{f_samples.mean(axis=0)[:20]=}")
 print(f"{f_samples.mean(axis=0).mean()=}")
 print(f"{f_samples.mean(axis=0).std()=}")
 
-# %%
 # Take more samples, the means should get closer to c
 f_samples = pri_f.sample(sample_shape=torch.Size((M * 200,)))
 print(f"{f_samples.shape=}")
@@ -260,7 +255,6 @@ print(f"{f_samples.mean(axis=0)[:20]=}")
 print(f"{f_samples.mean(axis=0).mean()=}")
 print(f"{f_samples.mean(axis=0).std()=}")
 
-# %% [markdown]
 # # Fit GP to data: optimize hyper params
 #
 # In each step of the optimizer, we condition on the training data (e.g. do
@@ -274,7 +268,7 @@ print(f"{f_samples.mean(axis=0).std()=}")
 # Observe how all hyper params converge. In particular, note that the constant
 # mean $m(\ve x)=c$ converges to the `const` value in `generate_data()`.
 
-# %%
+# +
 # Train mode
 model.train()
 likelihood.train()
@@ -302,11 +296,11 @@ for ax, (p_name, p_lst) in zip(axs, history.items()):
     ax.plot(p_lst)
     ax.set_title(p_name)
     ax.set_xlabel("iterations")
+# -
 
 
-# %% [markdown]
 # # Run prediction
-
+#
 # We show "noiseless" (left: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma)}$) vs.
 # "noisy" (right: $\sigma = \sqrt{\mathrm{diag}(\ma\Sigma + \sigma_n^2\,\ma
 # I_N)}$) predictions, where $\ma\Sigma\equiv\cov(\ve f_*)$ is the posterior
@@ -319,7 +313,7 @@ for ax, (p_name, p_lst) in zip(axs, history.items()):
 # https://elcorto.github.io/gp_playground/content/gp_pred_comp/notebook_plot.html
 # for details.
 
-# %%
+# +
 # Evaluation (predictive posterior) mode
 model.eval()
 likelihood.eval()
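
For reference, the jupytext pairing switch this patch performs (`py:percent` -> `py:light`), shown in miniature. This is a minimal sketch of the two marker conventions only; the cell contents (`x = 1`, `print(x)`) are illustrative and not taken from the patch:

```python
# py:percent pairing (removed by this patch): every cell starts with an
# explicit marker line.

# %% [markdown]
# # A markdown cell
#
# Markdown lives in comment lines under a "# %% [markdown]" marker.

# %%
# A code cell sits under a plain "# %%" marker.
x = 1
print(x)


# py:light pairing (added by this patch): markdown cells are bare comment
# blocks, code cells need no marker, and multi-statement code cells are
# fenced with "# +" / "# -".

# # A markdown cell
#
# A blank comment line ("#") keeps a markdown block together, which is why
# the blank line after "# # Run prediction" becomes "#" in this patch.

# +
x = 1
print(x)
# -
```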