diff --git a/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py b/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py
index 796a359b142101e0c26b440e695dacf88b297d10..cabc012929132b0860df2bc73a0594f616609f96 100644
--- a/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py
+++ b/pySDC/implementations/problem_classes/GrayScott_MPIFFT.py
@@ -19,7 +19,7 @@ class grayscott_imex_diffusion(IMEX_Laplacian_MPIFFT):
         \frac{\partial u}{\partial t} = D_u \Delta u - u v^2 + A (1 - u),
 
     .. math::
-        \frac{\partial v}{\partial t} = D_v \Delta v + u v^2 - B u
+        \frac{\partial v}{\partial t} = D_v \Delta v + u v^2 - B v
 
     in :math:`x \in \Omega:=[-L/2, L/2]^N` with :math:`N=2,3`. Spatial discretization is done by using
     Fast Fourier transformation for solving the linear parts provided by ``mpi4py-fft`` [2]_, see also
@@ -222,7 +222,7 @@ class grayscott_imex_diffusion(IMEX_Laplacian_MPIFFT):
 
             for _ in range(-self.num_blobs):
                 x0 = rng.random(size=self.ndim) * self.L[0] - self.L[0] / 2
-                l = rng.random(size=self.ndim) * self.L[0] / self.nvars[0] * 30
+                l = rng.random(size=self.ndim) * self.L[0] / self.nvars[0] * 80
 
                 masks = [xp.logical_and(self.X[i] > x0[i], self.X[i] < x0[i] + l[i]) for i in range(self.ndim)]
                 mask = masks[0]
@@ -236,33 +236,54 @@ class grayscott_imex_diffusion(IMEX_Laplacian_MPIFFT):
             """
             Blobs as in https://www.chebfun.org/examples/pde/GrayScott.html
             """
-            assert self.ndim == 2, 'The initial conditions are 2D for now..'
-
             inc = self.L[0] / (self.num_blobs + 1)
 
             for i in range(1, self.num_blobs + 1):
                 for j in range(1, self.num_blobs + 1):
-                    signs = (-1) ** rng.integers(low=0, high=2, size=2)
-
-                    # This assumes that the box is [-L/2, L/2]^2
-                    _u[...] += -xp.exp(
-                        -80.0
-                        * (
-                            (self.X[0] + self.x0 + inc * i + signs[0] * 0.05) ** 2
-                            + (self.X[1] + self.x0 + inc * j + signs[1] * 0.02) ** 2
+                    signs = (-1) ** rng.integers(low=0, high=2, size=self.ndim)
+
+                    if self.ndim == 2:
+                        # This assumes that the box is [-L/2, L/2]^2
+                        _u[...] += -xp.exp(
+                            -80.0
+                            * (
+                                (self.X[0] + self.x0 + inc * i + signs[0] * 0.05) ** 2
+                                + (self.X[1] + self.x0 + inc * j + signs[1] * 0.02) ** 2
+                            )
+                        )
+                        _v[...] += xp.exp(
+                            -80.0
+                            * (
+                                (self.X[0] + self.x0 + inc * i - signs[0] * 0.05) ** 2
+                                + (self.X[1] + self.x0 + inc * j - signs[1] * 0.02) ** 2
+                            )
+                        )
+                    elif self.ndim == 3:
+                        z_pos = self.x0 + rng.random() * self.L[2]
+                        # This assumes that the box is [-L/2, L/2]^3
+                        _u[...] += -xp.exp(
+                            -80.0
+                            * (
+                                (self.X[0] + self.x0 + inc * i + signs[0] * 0.05) ** 2
+                                + (self.X[1] + self.x0 + inc * j + signs[1] * 0.02) ** 2
+                                + (self.X[2] - z_pos + signs[2] * 0.035) ** 2
+                            )
                         )
-                    )
-                    _v[...] += xp.exp(
-                        -80.0
-                        * (
-                            (self.X[0] + self.x0 + inc * i - signs[0] * 0.05) ** 2
-                            + (self.X[1] + self.x0 + inc * j - signs[1] * 0.02) ** 2
+                        _v[...] += xp.exp(
+                            -80.0
+                            * (
+                                (self.X[0] + self.x0 + inc * i - signs[0] * 0.05) ** 2
+                                + (self.X[1] + self.x0 + inc * j - signs[1] * 0.02) ** 2
+                                + (self.X[2] - z_pos - signs[2] * 0.035) ** 2
+                            )
                         )
-                    )
+                    else:
+                        raise NotImplementedError
 
             _u += 1
         else:
-            raise NotImplementedError
+            _u[...] = rng.random(_u.shape)
+            _v[...] = rng.random(_v.shape)
 
         u = self.u_init
         if self.spectral:
diff --git a/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py b/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
index 8f4ee38efb759abe0608dc8ed6d4ca7ac077d3c2..223b931cb42b6e0ffc29cabadb8cbeaee8a6235f 100644
--- a/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
+++ b/pySDC/implementations/problem_classes/generic_MPIFFT_Laplacian.py
@@ -85,6 +85,7 @@ class IMEX_Laplacian_MPIFFT(Problem):
             collapse=True,
             backend=self.fft_backend,
             comm_backend=self.fft_comm_backend,
+            grid=(-1,),
         )
 
         # get test data to figure out type and dimensions
diff --git a/pySDC/projects/GPU/README.rst b/pySDC/projects/GPU/README.rst
index b0fe8173aa38bf814ac3ffedaa56f39d0e09995f..01c64b08b86f4e2a98cdaf5d8fa1c0ec0cf6d396 100644
--- a/pySDC/projects/GPU/README.rst
+++ b/pySDC/projects/GPU/README.rst
@@ -45,18 +45,84 @@ For instance, use
 
 .. code-block:: bash
  
-    srun -n 4 python work_precision.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=run
-    mpirun -np 8 python work_precision.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=plot
-    python work_precision.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=video
+    srun -n 4 python run_experiment.pyy --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=run
+    mpirun -np 8 python run_experiment.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=plot
+    python run_experiment.py --config=GS_USkate --procs=1/1/4 --useGPU=True --mode=video
 
 to first run the problem, then make plots and then make a video for Gray-Scott with the U-Skate configuration (see arXiv:1501.01990).
 
 To do a parallel scaling test, you can go to JUWELS Booster and use, for instance,
 
 .. code-block:: bash
-   python analysis_scripts/parallel_scaling.py --mode=run --scaling=strong --space_time=True --XPU=GPU --problem=GS
-   srun python analysis_scripts/parallel_scaling.py --mode=plot --scaling=strong --space_time=True --XPU=GPU --problem=GS
+
+   python analysis_scripts/parallel_scaling.py --mode=run --space_time=True --XPU=GPU --problem=GS3D
+   python analysis_scripts/parallel_scaling.py --mode=plot --space_time=True --XPU=GPU --problem=GS3D
 
 This will generate jobscripts and submit the jobs. Notice that you have to wait for the jobs to complete before you can plot them.
 
 To learn more about the options for the scripts, run them with `--help`.
+
+Reproducing plots in Thomas Baumann's thesis
+--------------------------------------------
+Keep in mind that the results of the experiments are specific to the hardware that was used in the experiments.
+To record the data for space-time parallel scaling experiments with Gray-Scott and RBC, run the following commands on the specified machines within the directory that contains this README.
+
+.. code-block:: bash
+
+    # run on JUWELS
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=GS3D --XPU=CPU --space_time=False
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=GS3D --XPU=CPU --space_time=True
+
+    # run on JUWELS booster
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=GS3D --XPU=GPU --space_time=False
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=GS3D --XPU=GPU --space_time=True
+
+    # run on JURECA DC
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=RBC --XPU=CPU --space_time=False
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=RBC --XPU=CPU --space_time=True
+
+    # run on JUWELS booster
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=RBC --XPU=GPU --space_time=False
+    python analysis_scripts/parallel_scaling.py --mode=run --problem=RBC --XPU=GPU --space_time=True
+
+These commands will submit a bunch of jobscripts with the individual runs.
+Keep in mind that these are specific to a compute project and some paths are account-specific.
+Most likely, you will have to change options at the top of the file `./etc/generate_jobscript.py` before you can run anything.
+Also, notice that you may not be allowed to request all resources needed for the largest Gray-Scott GPU run during normal operation of JUWELS booster.
+
+After all jobs have run to completion, you have recorded all scaling data and may plot the results with the following command:
+
+.. code-block:: bash
+
+    python paper_plots.py --target=thesis
+
+In order to run the production runs, modify the `path` class attribute of `LargeSim` in `analysis_scripts/large_simulations.py`.
+Then use the following commands on the specified machines:
+
+.. code-block:: bash
+
+    # run on JUWELS booster
+    python analysis_scripts/large_simulations.py --mode=run --problem=GS --XPU=GPU
+
+    # run on JURECA DC
+    python analysis_scripts/large_simulations.py --mode=run --problem=RBC --XPU=CPU
+
+Plotting the results of the Gray-Scott simulation requires a lot of memory and will take very long.
+Modify the paths in `analysis_scripts/plot_large_simulations.py` and then run:
+
+.. code-block:: bash
+
+    python analysis_scripts/3d_plot_GS_large.py --base_path=<path>
+    python analysis_scripts/plot_large_simulations.py --problem=GS
+
+Plotting the results of the Rayleigh-Benard production run is more easy.
+After modifying the paths as earlier, run the following commands:
+
+.. code-block:: bash
+
+    python analysis_scripts/large_simulations.py --mode=plot --problem=RBC --XPU=CPU
+    python analysis_scripts/large_simulations.py --mode=video --problem=RBC --XPU=CPU
+    python analysis_scripts/plot_large_simulations.py --problem=RBC
+    
+Run scripts with `--help` to learn more about parameters.
+Keep in mind that not all features are supported with all problems.
diff --git a/pySDC/projects/GPU/analysis_scripts/3d_plot_GS_large.py b/pySDC/projects/GPU/analysis_scripts/3d_plot_GS_large.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9f18579d59afa860495b6cc3169fb3c9c36240f
--- /dev/null
+++ b/pySDC/projects/GPU/analysis_scripts/3d_plot_GS_large.py
@@ -0,0 +1,162 @@
+import numpy as np
+import pickle
+import pyvista as pv
+import subprocess
+import gc
+from mpi4py import MPI
+from tqdm import tqdm
+
+
+class Grid:
+    def __init__(self, grid, label, zoom_range):
+        self.grid = grid
+        self.label = label
+        self.zoom_range = zoom_range
+
+    def get_camera_path(self):
+        import os
+
+        os.makedirs('etc/cameras', exist_ok=True)
+        return f'etc/cameras/cam{self.label}_{len(self.grid.x)}.pickle'
+
+    def get_camera_pos(self):
+        with open(self.get_camera_path(), 'rb') as file:
+            return pickle.load(file)
+
+    def set_camera_pos(self, pos):
+        with open(self.get_camera_path(), 'wb') as file:
+            pickle.dump(pos, file)
+
+    def get_zoom(self, frame):
+        return (self.zoom_range[1] - self.zoom_range[0]) * frame / 100 + self.zoom_range[0]
+
+
+def plot(
+    n_time,
+    n_space,
+    useGPU,
+    n_frames,
+    base_path,
+    space_range,
+    res,
+    start_frame=0,
+    plot_resolution=2048,
+    n_samples=1024,
+    zoom=1e-2,
+):  # pragma: no cover
+    comm = MPI.COMM_WORLD
+
+    space_range = tqdm(space_range)
+    space_range.set_description('load files')
+
+    for frame in range(start_frame, n_frames, comm.size):
+        i = frame + comm.rank
+
+        v = None
+        gc.collect()
+        for procs in space_range:
+            gc.collect()
+
+            path = f'{base_path}/GrayScottLarge-res_{res}-useGPU_{useGPU}-procs_1_{n_time}_{n_space}-0-{procs}-solution_{i:06d}.pickle'
+            with open(path, 'rb') as file:
+                _data = pickle.load(file)
+
+            if v is None:
+                shape = _data['shape']
+                v = pv.ImageData(dimensions=shape, spacing=tuple([1 / me for me in shape]))
+                v['values'] = np.zeros(np.prod(shape))
+
+            local_slice_flat = slice(np.prod(_data['v'].shape) * procs, np.prod(_data['v'].shape) * (procs + 1))
+            v['values'][local_slice_flat] = _data['v'].flatten()
+
+        # sampled = Grid(pv.ImageData(dimensions=(n_samples,) * 3, spacing=(1 / n_samples,) * 3), '', [1.0, 1.0])
+        zoomed = Grid(
+            pv.ImageData(dimensions=(n_samples,) * 3, spacing=(zoom / n_samples,) * 3, origin=[0.8, 0.47, 0.03]),
+            '_zoomed',
+            [1.0, 1.2],
+        )
+
+        for grid in [zoomed]:
+
+            p = pv.Plotter(off_screen=True)
+            contours = grid.grid.sample(v, progress_bar=True, categorical=True).contour(
+                isosurfaces=[0.3], method='flying_edges', progress_bar=True
+            )
+
+            p.add_mesh(contours, opacity=0.5, cmap=['teal'])
+            p.remove_scalar_bar()
+            p.camera.azimuth += 15
+
+            p.camera.Elevation(0.9)
+            plotting_path = './simulation_plots/'
+
+            path = f'{plotting_path}/GS_large_{i:06d}{grid.label}.png'
+
+            if frame == 0:
+                grid.set_camera_pos(p.camera.position)
+
+            p.camera.position = tuple(pos * grid.get_zoom(frame) for pos in grid.get_camera_pos())
+
+            p.screenshot(path, window_size=(plot_resolution,) * 2)
+            print(f'Saved {path}', flush=True)
+
+
+def video(view=None):  # pragma: no cover
+    path = f'simulation_plots/GS_large_%06d{view}.png'
+    path_target = f'videos/GS_large{view}.mp4'
+
+    cmd = f'ffmpeg -i {path} -pix_fmt yuv420p -r 9 -s 2048:1536 -y {path_target}'.split()
+
+    subprocess.run(cmd)
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--XPU', type=str, choices=['CPU', 'GPU'], default='GPU')
+    parser.add_argument('--mode', type=str, choices=['plot', 'video'], default='plot')
+    parser.add_argument('--nframes', type=int, default=100)
+    parser.add_argument('--start', type=int, default=0)
+    parser.add_argument('--base_path', type=str, default='/p/scratch/ccstma/baumann7/large_runs/data/')
+    parser.add_argument('--space_range', type=int, default=None)
+    parser.add_argument('--zoom', type=float, default=9e-2)
+    parser.add_argument('--n_samples', type=int, default=1024)
+    args = parser.parse_args()
+
+    from pySDC.projects.GPU.analysis_scripts.large_simulations import GSLarge
+
+    sim = GSLarge()
+
+    if args.XPU == 'CPU':
+        sim.setup_CPU_params()
+    elif args.XPU == 'GPU':
+        sim.setup_GPU_params()
+    else:
+        raise NotImplementedError()
+
+    space_range = range(sim.params['procs'][2] if args.space_range is None else args.space_range)
+
+    if args.mode == 'plot':
+        pv.global_theme.allow_empty_mesh = True
+        try:
+            pv.start_xvfb(window_size=(4096, 4096))
+        except OSError:
+            pass
+        plot(
+            n_time=sim.params['procs'][1],
+            n_space=sim.params['procs'][2],
+            useGPU=sim.params['useGPU'],
+            base_path=args.base_path,
+            space_range=space_range,
+            n_frames=args.nframes,
+            res=sim.params['res'],
+            start_frame=args.start,
+            n_samples=args.n_samples,
+            zoom=args.zoom,
+        )
+    elif args.mode == 'video':
+        for view in ['', '_zoomed']:
+            video(view)
+    else:
+        raise NotImplementedError()
diff --git a/pySDC/projects/GPU/analysis_scripts/large_simulations.py b/pySDC/projects/GPU/analysis_scripts/large_simulations.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8692d77cd6e1bf1a685084960866b334673701
--- /dev/null
+++ b/pySDC/projects/GPU/analysis_scripts/large_simulations.py
@@ -0,0 +1,229 @@
+import numpy as np
+from pySDC.projects.GPU.etc.generate_jobscript import write_jobscript
+
+
+class LargeSim:
+    config = None
+    params = {}
+    path = '/p/scratch/ccstma/baumann7/large_runs'
+    email = 't.baumann@fz-juelich.de'
+
+    def setup_CPU_params(self):
+        raise NotImplementedError()
+
+    def setup_GPU_params(self):
+        raise NotImplementedError()
+
+    @property
+    def plotting_params(self):
+        return {
+            'tasks_per_node': 10,
+            'partition': 'develgpus',
+            'cluster': 'jusuf',
+            'time': '3:60:00',
+        }
+
+    def write_jobscript_for_run(self, submit=True, restart_idx=0):
+        procs = self.params['procs']
+        useGPU = self.params['useGPU']
+        partition = self.params['partition']
+        tasks_per_node = self.params['tasks_per_node']
+        cluster = self.params['cluster']
+        res = self.params['res']
+        time = self.params['time']
+
+        sbatch_options = [
+            f'-n {np.prod(procs)}',
+            f'-p {partition}',
+            f'--tasks-per-node={tasks_per_node}',
+            f'--time={time}',
+            '--mail-type=ALL',
+            f'--mail-user={self.email}',
+        ]
+        if 'reservation' in self.params.keys():
+            sbatch_options += [f'--reservation={self.params["reservation"]}']
+
+        srun_options = [f'--tasks-per-node={tasks_per_node}']
+
+        if useGPU:
+            srun_options += ['--cpus-per-task=4', '--gpus-per-task=1']
+            sbatch_options += ['--cpus-per-task=4', '--gpus-per-task=1']
+
+        procs = (''.join(f'{me}/' for me in procs))[:-1]
+        command = f'run_experiment.py --mode=run --res={res} --config={self.config} --procs={procs} -o {self.path} --restart_idx {restart_idx}'
+
+        if useGPU:
+            command += ' --useGPU=True'
+
+        write_jobscript(sbatch_options, srun_options, command, cluster, submit=submit, name=f'{self.config}')
+
+    def write_jobscript_for_plotting(self, num_procs=20, mode='plot', submit=True, restart_idx=0):
+        procs_sim = self.params['procs']
+        useGPU = self.params['useGPU']
+        res = self.params['res']
+        procs = num_procs
+        partition = self.plotting_params['partition']
+        tasks_per_node = self.plotting_params['tasks_per_node']
+        cluster = self.plotting_params['cluster']
+
+        sbatch_options = [
+            f'-n {np.prod(procs)}',
+            f'-p {partition}',
+            f'--tasks-per-node={tasks_per_node}',
+            f'--time={self.plotting_params["time"]}',
+        ]
+
+        srun_options = [f'--tasks-per-node={tasks_per_node}']
+
+        procs_sim = (''.join(f'{me}/' for me in procs_sim))[:-1]
+        command = f'run_experiment.py --mode={mode} --res={res} --config={self.config} --procs={procs_sim} -o {self.path} --restart_idx {restart_idx}'
+
+        if useGPU:
+            command += ' --useGPU=True'
+
+        write_jobscript(sbatch_options, srun_options, command, cluster, submit=submit)
+
+    def write_jobscript_for_video(self, num_procs=20, submit=True):
+        procs_sim = self.params['procs']
+        useGPU = self.params['useGPU']
+        res = self.params['res']
+        procs = num_procs
+        partition = self.plotting_params['partition']
+        tasks_per_node = self.plotting_params['tasks_per_node']
+        cluster = self.plotting_params['cluster']
+
+        sbatch_options = [
+            f'-n {np.prod(procs)}',
+            f'-p {partition}',
+            f'--tasks-per-node={tasks_per_node}',
+            f'--time={self.plotting_params["time"]}',
+        ]
+
+        srun_options = [f'--tasks-per-node={tasks_per_node}']
+
+        procs_sim = (''.join(f'{me}/' for me in procs_sim))[:-1]
+        command = f'run_experiment.py --mode=video --res={res} --config={self.config} --procs={procs_sim}'
+
+        if useGPU:
+            command += ' --useGPU=True'
+
+        write_jobscript(sbatch_options, srun_options, command, cluster, submit=submit)
+
+
+class GSLarge(LargeSim):
+    config = 'GS_large'
+
+    def setup_CPU_params(self):
+        """
+        Test params with a small run.
+        """
+        self.params = {
+            'procs': [1, 1, 4],
+            'useGPU': False,
+            'tasks_per_node': 16,
+            'partition': 'batch',
+            'cluster': 'jusuf',
+            'res': 64,
+            'time': '0:15:00',
+        }
+
+    def setup_GPU_params_test(self):
+        """
+        Test params with a small run.
+        """
+        self.params = {
+            'procs': [1, 4, 1],
+            'useGPU': True,
+            'tasks_per_node': 4,
+            'partition': 'develbooster',
+            'cluster': 'booster',
+            'res': 512,
+            'time': '0:20:00',
+        }
+
+    def setup_GPU_params(self):
+        """
+        Params we actually want to use for the large simulation
+        """
+        self.params = {
+            'procs': [1, 4, 192],
+            'useGPU': True,
+            'tasks_per_node': 4,
+            'partition': 'booster',
+            'cluster': 'booster',
+            'res': 2304,
+            'time': '1:00:00',
+        }
+
+    def setup_GPU_paramsFull(self):
+        """
+        Config for a stupid run on the whole machine.
+        """
+        self.params = {
+            'procs': [1, 4, 896],
+            'useGPU': True,
+            'tasks_per_node': 4,
+            'partition': 'largebooster',
+            'cluster': 'booster',
+            'res': 2688,
+            'time': '0:20:00',
+            'reservation': 'big-days-20241119',
+        }
+
+
+class RBCLarge(LargeSim):
+    config = 'RBC_large'
+
+    def setup_CPU_params(self):
+        """
+        Params for the large run
+        """
+        self.params = {
+            'procs': [1, 4, 1024],
+            'useGPU': False,
+            'tasks_per_node': 128,
+            'partition': 'dc-cpu',
+            'cluster': 'jureca',
+            'res': 4096,
+            'time': '4:30:00',
+        }
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--problem', type=str, default='GS', choices=['RBC', 'GS'])
+    parser.add_argument('--XPU', type=str, choices=['CPU', 'GPU'], default='CPU')
+    parser.add_argument('--mode', type=str, choices=['run', 'plot', 'video', 'plot_series'], default='plot')
+    parser.add_argument('--submit', type=str, choices=['yes', 'no'], default='yes')
+    parser.add_argument('--num_procs', type=int, default=10)
+    parser.add_argument('--restart_idx', type=int, default=0)
+    args = parser.parse_args()
+
+    if args.problem == 'GS':
+        cls = GSLarge
+    elif args.problem == 'RBC':
+        cls = RBCLarge
+    else:
+        raise NotImplementedError
+
+    sim = cls()
+
+    if args.XPU == 'CPU':
+        sim.setup_CPU_params()
+    elif args.XPU == 'GPU':
+        sim.setup_GPU_params()
+    else:
+        raise NotImplementedError()
+
+    if args.mode == 'run':
+        sim.write_jobscript_for_run(submit=args.submit, restart_idx=args.restart_idx)
+    elif args.mode == 'plot':
+        sim.write_jobscript_for_plotting(num_procs=args.num_procs, submit=args.submit, restart_idx=args.restart_idx)
+    elif args.mode == 'plot_series':
+        sim.write_jobscript_for_plotting(num_procs=1, submit=args.submit, mode='plot_series')
+    elif args.mode == 'video':
+        sim.write_jobscript_for_video(num_procs=args.num_procs, submit=args.submit)
+    else:
+        raise NotImplementedError()
diff --git a/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py b/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py
index 6358ee66dc6664e59e0b452d3cbdf29c0da171f8..fe190bd6f489ef7c00d72a7a211915a4366910ac 100644
--- a/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py
+++ b/pySDC/projects/GPU/analysis_scripts/parallel_scaling.py
@@ -9,104 +9,185 @@ from pySDC.helpers.plot_helper import setup_mpl, figsize_by_journal
 setup_mpl()
 
 
+class Experiment(object):
+    def __init__(self, res, PinT=False, start=0, stop=-1, sequence=None, marker='x'):
+        self.res = res
+        self.PinT = PinT
+        self.start = start
+        self.stop = stop
+        self._sequence = sequence
+        self.marker = marker
+
+    @property
+    def sequence(self):
+        if self._sequence is not None:
+            return self._sequence
+        else:
+            sequence = []
+            i = self.start
+            while i <= self.stop:
+                sequence += [i]
+                i *= 2
+            return sequence
+
+
 class ScalingConfig(object):
     cluster = None
     config = ''
-    base_resolution = -1
-    base_resolution_weak = -1
     useGPU = False
     partition = None
     tasks_per_node = None
     ndim = 2
     tasks_time = 1
-    max_steps_space = None
-    max_steps_space_weak = None
     sbatch_options = []
-    max_nodes = 9999
-
-    def __init__(self, space_time_parallel):
-        if space_time_parallel in ['False', False]:
-            self._tasks_time = 1
-        else:
-            self._tasks_time = self.tasks_time
+    experiments = []
+    OMP_NUM_THREADS = 1
+
+    def format_resolution(self, resolution):
+        return f'${{{resolution}}}^{{{self.ndim}}}$'
+
+    def run_scaling_test(self, **kwargs):
+        for experiment in self.experiments:
+            res = experiment.res
+
+            tasks_time = self.tasks_time if experiment.PinT else 1
+
+            for i in experiment.sequence:
+                procs = [1, tasks_time, int(np.ceil(i / tasks_time))]
+
+                sbatch_options = [
+                    f'-n {np.prod(procs)}',
+                    f'-p {self.partition}',
+                    f'--tasks-per-node={self.tasks_per_node}',
+                ] + self.sbatch_options
+                srun_options = [f'--tasks-per-node={self.tasks_per_node}']
+                if self.useGPU:
+                    srun_options += [f'--cpus-per-task={self.OMP_NUM_THREADS}', '--gpus-per-task=1']
+                    sbatch_options += [f'--cpus-per-task={self.OMP_NUM_THREADS}', '--gpus-per-task=1']
+
+                procs = (''.join(f'{me}/' for me in procs))[:-1]
+                command = f'run_experiment.py --mode=run --res={res} --config={self.config} --procs={procs}'
+
+                if self.useGPU:
+                    command += ' --useGPU=True'
+
+                write_jobscript(
+                    sbatch_options,
+                    srun_options,
+                    command,
+                    self.cluster,
+                    name=f'{type(self).__name__}_{res}',
+                    OMP_NUM_THREADS=self.OMP_NUM_THREADS,
+                    **kwargs,
+                )
 
-    def get_resolution_and_tasks(self, strong, i):
-        if strong:
-            return self.base_resolution, [1, self._tasks_time, 2**i]
+    def plot_scaling_test(self, ax, quantity='time', **plotting_params):  # pragma: no cover
+        from matplotlib.colors import TABLEAU_COLORS
+
+        cmap = TABLEAU_COLORS
+        colors = list(cmap.values())
+
+        for experiment in self.experiments:
+            tasks_time = self.tasks_time if experiment.PinT else 1
+            timings = {}
+
+            plotting_params = {
+                (False, False): {
+                    'ls': '--',
+                    'label': f'CPU {self.format_resolution(experiment.res)}',
+                    'color': colors[2],
+                },
+                (False, True): {
+                    'ls': '-.',
+                    'label': f'CPU PinT {self.format_resolution(experiment.res)}',
+                    'color': colors[3],
+                },
+                (True, False): {
+                    'ls': '-',
+                    'label': f'GPU {self.format_resolution(experiment.res)}',
+                    'color': colors[0],
+                },
+                (True, True): {
+                    'ls': ':',
+                    'label': f'GPU PinT {self.format_resolution(experiment.res)}',
+                    'color': colors[1],
+                },
+            }
+
+            i = experiment.start
+            res = experiment.res
+
+            for i in experiment.sequence:
+                procs = [1, tasks_time, int(np.ceil(i / tasks_time))]
+                args = {'useGPU': self.useGPU, 'config': self.config, 'res': res, 'procs': procs, 'mode': None}
+
+                config = get_config(args)
+
+                path = f'data/{config.get_path(ranks=[me -1 for me in procs])}-stats-whole-run.pickle'
+                try:
+                    with open(path, 'rb') as file:
+                        stats = pickle.load(file)
+
+                    if args['useGPU']:
+                        timing_step = get_sorted(stats, type='GPU_timing_step')
+                    else:
+                        timing_step = get_sorted(stats, type='timing_step')
+
+                    t_mean = np.mean([me[1] for me in timing_step])
+                    t_min = np.min([me[1] for me in timing_step][1:])
+
+                    if quantity == 'throughput':
+                        timings[np.prod(procs) / self.tasks_per_node] = experiment.res**self.ndim / t_mean
+                    elif quantity == 'throughput_per_task':
+                        timings[np.prod(procs)] = experiment.res**self.ndim / t_mean
+                    elif quantity == 'efficiency':
+                        timings[np.prod(procs) / self.tasks_per_node] = (
+                            experiment.res**self.ndim / t_mean / np.prod(procs)
+                        )
+                    elif quantity == 'time':
+                        timings[np.prod(procs) / self.tasks_per_node] = t_mean
+                    elif quantity == 'time_per_task':
+                        timings[np.prod(procs)] = t_mean
+                    elif quantity == 'min_time_per_task':
+                        timings[np.prod(procs)] = t_min
+                    else:
+                        raise NotImplementedError
+                except (FileNotFoundError, ValueError):
+                    pass
+
+            ax.loglog(
+                timings.keys(),
+                timings.values(),
+                **plotting_params[(self.useGPU, experiment.PinT)],
+                marker=experiment.marker,
+            )
+        if 'per_task' in quantity:
+            ax.set_xlabel(r'$N_\mathrm{procs}$')
         else:
-            return self.base_resolution_weak * int(self._tasks_time ** (1.0 / self.ndim)) * (2**i), [
-                1,
-                self._tasks_time,
-                (2 * self.ndim) ** i,
-            ]
-
-    def run_scaling_test(self, strong=True):
-        max_steps = self.max_steps_space if strong else self.max_steps_space_weak
-        for i in range(max_steps):
-            res, procs = self.get_resolution_and_tasks(strong, i)
-
-            _nodes = np.prod(procs) // self.tasks_per_node
-            if _nodes > self.max_nodes:
-                break
-
-            sbatch_options = [
-                f'-n {np.prod(procs)}',
-                f'-p {self.partition}',
-                f'--tasks-per-node={self.tasks_per_node}',
-            ] + self.sbatch_options
-            srun_options = [f'--tasks-per-node={self.tasks_per_node}']
-            if self.useGPU:
-                srun_options += ['--cpus-per-task=4', '--gpus-per-task=1']
-                sbatch_options += ['--cpus-per-task=4', '--gpus-per-task=1']
-
-            procs = (''.join(f'{me}/' for me in procs))[:-1]
-            command = f'run_experiment.py --mode=run --res={res} --config={self.config} --procs={procs}'
-
-            if self.useGPU:
-                command += ' --useGPU=True'
-
-            write_jobscript(sbatch_options, srun_options, command, self.cluster)
-
-    def plot_scaling_test(self, strong, ax, plot_ideal=False, **plotting_params):  # pragma: no cover
-        timings = {}
-
-        max_steps = self.max_steps_space if strong else self.max_steps_space_weak
-        for i in range(max_steps):
-            res, procs = self.get_resolution_and_tasks(strong, i)
-
-            args = {'useGPU': self.useGPU, 'config': self.config, 'res': res, 'procs': procs, 'mode': None}
-
-            config = get_config(args)
-
-            path = f'data/{config.get_path(ranks=[me -1 for me in procs])}-stats-whole-run.pickle'
-            try:
-                with open(path, 'rb') as file:
-                    stats = pickle.load(file)
-
-                timing_step = get_sorted(stats, type='timing_step')
-                timings[np.prod(procs) / self.tasks_per_node] = np.mean([me[1] for me in timing_step])
-            except FileNotFoundError:
-                pass
-
-        if plot_ideal:
-            if strong:
-                ax.loglog(
-                    timings.keys(),
-                    list(timings.values())[0] * list(timings.keys())[0] / np.array(list(timings.keys())),
-                    ls='--',
-                    color='grey',
-                    label='ideal',
-                )
-        ax.loglog(timings.keys(), timings.values(), **plotting_params)
-        ax.set_xlabel(r'$N_\mathrm{nodes}$')
-        ax.set_ylabel(r'$t_\mathrm{step}$')
+            ax.set_xlabel(r'$N_\mathrm{nodes}$')
+        labels = {
+            'throughput': 'throughput / DoF/s',
+            'throughput_per_task': 'throughput / DoF/s',
+            'time': r'$t_\mathrm{step}$ / s',
+            'time_per_task': r'$t_\mathrm{step}$ / s',
+            'min_time_per_task': r'minimal $t_\mathrm{step}$ / s',
+            'efficiency': 'efficiency / DoF/s/task',
+        }
+        ax.set_ylabel(labels[quantity])
 
 
 class CPUConfig(ScalingConfig):
-    cluster = 'jusuf'
+    # cluster = 'jusuf'
+    cluster = 'juwels'
     partition = 'batch'
     tasks_per_node = 16
-    max_nodes = 144
+
+
+class JurecaCPU(ScalingConfig):
+    cluster = 'jureca'
+    partition = 'dc-cpu'
+    tasks_per_node = 64
+    OMP_NUM_THREADS = 1
 
 
 class GPUConfig(ScalingConfig):
@@ -114,82 +195,240 @@ class GPUConfig(ScalingConfig):
     partition = 'booster'
     tasks_per_node = 4
     useGPU = True
-    max_nodes = 936
+    OMP_NUM_THREADS = 12
 
 
-class GrayScottSpaceScalingCPU(CPUConfig, ScalingConfig):
-    base_resolution = 8192
-    base_resolution_weak = 512
-    config = 'GS_scaling'
-    max_steps_space = 11
-    max_steps_space_weak = 11
+class GrayScottSpaceScalingCPU3D(CPUConfig, ScalingConfig):
+    ndim = 3
+    config = 'GS_scaling3D'
     tasks_time = 4
-    sbatch_options = ['--time=3:30:00']
+    tasks_per_node = 16
+    sbatch_options = ['--time=0:45:00']
+    experiments = [
+        Experiment(res=512, PinT=False, start=1, stop=256, marker='>'),
+        Experiment(res=512, PinT=True, start=1, stop=1024, marker='>'),
+        Experiment(res=1024, PinT=False, start=256, stop=512, marker='x'),
+        Experiment(res=1024, PinT=True, start=256, stop=2048, marker='x'),
+        # {'res': 2304, 'PinT': True, 'start': 768, 'stop': 6144, 'marker': '.'},
+        # {'res': 2304, 'PinT': False, 'start': 768, 'stop': 6144, 'marker': '.'},
+    ]
+
+
+class GrayScottSpaceScalingGPU3D(GPUConfig, ScalingConfig):
+    ndim = 3
+    config = 'GS_scaling3D'
+    tasks_time = 4
+    sbatch_options = ['--time=0:07:00']
+
+    experiments = [
+        Experiment(res=512, PinT=True, start=1, stop=512, marker='>'),
+        Experiment(res=512, PinT=False, start=1, stop=256, marker='>'),
+        Experiment(res=1024, PinT=True, start=16, stop=512, marker='x'),
+        Experiment(res=1024, PinT=False, start=16, stop=1024, marker='x'),
+        Experiment(res=2304, PinT=True, start=768, stop=1536, marker='.'),
+        Experiment(res=2304, PinT=False, start=192, stop=768, marker='.'),
+        # Experiment(res=4608, PinT=False, start=1536, stop=1536, marker='o'),
+        Experiment(res=4480, PinT=True, sequence=[3584], marker='o'),
+        Experiment(res=4480, PinT=False, sequence=[1494], marker='o'),
+    ]
+
+
+class RBCBaseConfig(ScalingConfig):
+    def format_resolution(self, resolution):
+        return fr'${{{resolution}}}\times{{{resolution//4}}}$'
 
 
-class GrayScottSpaceScalingGPU(GPUConfig, ScalingConfig):
-    base_resolution_weak = 1024
-    base_resolution = 8192
-    config = 'GS_scaling'
-    max_steps_space = 7
-    max_steps_space_weak = 5
+class RayleighBenardSpaceScalingCPU(JurecaCPU, RBCBaseConfig):
+    ndim = 2
+    config = 'RBC_scaling'
     tasks_time = 4
-    max_nodes = 64
+    sbatch_options = ['--time=0:15:00']
+    tasks_per_node = 64
+    OMP_NUM_THREADS = 1
+
+    experiments = [
+        Experiment(res=1024, PinT=False, start=1, stop=128, marker='.'),
+        Experiment(res=1024, PinT=True, start=4, stop=1024, marker='.'),
+        Experiment(res=2048, PinT=False, start=32, stop=256, marker='x'),
+        Experiment(res=2048, PinT=True, start=128, stop=1024, marker='x'),
+        Experiment(res=4096, PinT=False, start=256, stop=1024, marker='o'),
+        Experiment(res=4096, PinT=True, start=1024, stop=4096, marker='o'),
+        # Experiment(res=8192, PinT=False, sequence=[2048], marker='o'),
+        # Experiment(res=16384, PinT=False, sequence=[4096], marker='o'),
+    ]
+
+
+class RayleighBenardSpaceScalingCPULarge(RayleighBenardSpaceScalingCPU):
+    tasks_per_node = 32
 
+    experiments = [
+        Experiment(res=8192, PinT=False, sequence=[512, 1024, 2048], marker='o'),
+        # Experiment(res=16384, PinT=False, sequence=[4096], marker='o'),
+    ]
 
-def plot_scalings(strong, problem, kwargs):  # pragma: no cover
-    if problem == 'GS':
-        fig, ax = plt.subplots(figsize=figsize_by_journal('JSC_beamer', 1, 0.45))
 
-        plottings_params = [
-            {'plot_ideal': True, 'marker': 'x', 'label': 'CPU space parallel'},
-            {'marker': '>', 'label': 'CPU space time parallel'},
-            {'marker': '^', 'label': 'GPU space parallel'},
-            {'marker': '<', 'label': 'GPU space time parallel'},
+class RayleighBenardSpaceScalingGPU(GPUConfig, RBCBaseConfig):
+    ndim = 2
+    config = 'RBC_scaling'
+    tasks_time = 4
+    sbatch_options = ['--time=0:15:00']
+
+    experiments = [
+        Experiment(res=1024, PinT=False, start=1, stop=32, marker='.'),
+        Experiment(res=1024, PinT=True, start=4, stop=128, marker='.'),
+        Experiment(res=2048, PinT=False, start=16, stop=256, marker='x'),
+        Experiment(res=2048, PinT=True, start=16, stop=256, marker='x'),
+    ]
+
+
+# class RayleighBenardSpaceScalingCPU(CPUConfig, ScalingConfig):
+#     base_resolution = 1024
+#     base_resolution_weak = 128
+#     config = 'RBC_scaling'
+#     max_steps_space = 13
+#     max_steps_space_weak = 10
+#     tasks_time = 4
+#     max_nodes = 64
+#     # sbatch_options = ['--time=3:30:00']
+#     max_tasks = 256
+#
+#
+# class RayleighBenardSpaceScalingGPU(GPUConfig, ScalingConfig):
+#     base_resolution_weak = 256
+#     base_resolution = 1024
+#     config = 'RBC_scaling'
+#     max_steps_space = 9
+#     max_steps_space_weak = 9
+#     tasks_time = 4
+#     max_tasks = 256
+#     sbatch_options = ['--time=0:15:00']
+#     max_nodes = 64
+
+
+class RayleighBenardDedalusComparison(CPUConfig, RBCBaseConfig):
+    cluster = 'jusuf'
+    tasks_per_node = 64
+    base_resolution = 256
+    config = 'RBC_Tibo'
+    max_steps_space = 6
+    tasks_time = 4
+    experiments = [
+        Experiment(res=256, PinT=False, start=1, stop=64, marker='.'),
+        # Experiment(res=256, PinT=True, start=4, stop=256, marker='.'),
+    ]
+
+
+class RayleighBenardDedalusComparisonGPU(GPUConfig, ScalingConfig):
+    base_resolution_weak = 256
+    base_resolution = 256
+    config = 'RBC_Tibo'
+    max_steps_space = 4
+    max_steps_space_weak = 4
+    tasks_time = 4
+    experiments = [
+        Experiment(res=256, PinT=False, start=1, stop=1, marker='.'),
+        # Experiment(res=256, PinT=True, start=4, stop=256, marker='.'),
+    ]
+
+
+def plot_scalings(problem, **kwargs):  # pragma: no cover
+    if problem == 'GS3D':
+        configs = [
+            GrayScottSpaceScalingCPU3D(),
+            GrayScottSpaceScalingGPU3D(),
+        ]
+    elif problem == 'RBC':
+        configs = [
+            RayleighBenardSpaceScalingGPU(),
+            RayleighBenardSpaceScalingCPU(),
         ]
+    elif problem == 'RBC_dedalus':
         configs = [
-            GrayScottSpaceScalingCPU(space_time_parallel=False),
-            GrayScottSpaceScalingCPU(space_time_parallel=True),
-            GrayScottSpaceScalingGPU(space_time_parallel=False),
-            GrayScottSpaceScalingGPU(space_time_parallel=True),
+            RayleighBenardDedalusComparison(),
+            RayleighBenardDedalusComparisonGPU(),
         ]
 
-        for config, params in zip(configs, plottings_params):
-            config.plot_scaling_test(strong=strong, ax=ax, **params)
-        ax.legend(frameon=False)
-        fig.savefig(f'{PROJECT_PATH}/plots/{"strong" if strong else "weak"}_scaling_{problem}.pdf', bbox_inches='tight')
     else:
         raise NotImplementedError
 
+    ideal_lines = {
+        ('GS3D', 'throughput'): {'x': [0.25, 400], 'y': [5e6, 8e9]},
+        ('GS3D', 'time'): {'x': [0.25, 400], 'y': [80, 5e-2]},
+        ('RBC', 'throughput'): {'x': [1 / 10, 64], 'y': [2e4, 2e4 * 640]},
+        ('RBC', 'time'): {'x': [1 / 10, 64], 'y': [60, 60 / 640]},
+        ('RBC', 'time_per_task'): {'x': [1, 640], 'y': [60, 60 / 640]},
+        ('RBC', 'min_time_per_task'): {'x': [1, 640], 'y': [60, 60 / 640]},
+        ('RBC', 'throughput_per_task'): {'x': [1 / 1, 640], 'y': [2e4, 2e4 * 640]},
+    }
+
+    fig, ax = plt.subplots(figsize=figsize_by_journal('TUHH_thesis', 1, 0.6))
+    configs[1].plot_scaling_test(ax=ax, quantity='efficiency')
+    # ax.legend(frameon=False)
+    box = ax.get_position()
+    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+    ax.set_yscale('linear')
+    path = f'{PROJECT_PATH}/plots/scaling_{problem}_efficiency.pdf'
+    fig.savefig(path, bbox_inches='tight')
+    print(f'Saved {path!r}', flush=True)
+
+    for quantity in ['time', 'throughput', 'time_per_task', 'throughput_per_task', 'min_time_per_task'][::-1]:
+        fig, ax = plt.subplots(figsize=figsize_by_journal('TUHH_thesis', 1, 0.6))
+        for config in configs:
+            config.plot_scaling_test(ax=ax, quantity=quantity)
+        if (problem, quantity) in ideal_lines.keys():
+            ax.loglog(*ideal_lines[(problem, quantity)].values(), color='black', ls=':', label='ideal')
+        box = ax.get_position()
+        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
+        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+        path = f'{PROJECT_PATH}/plots/scaling_{problem}_{quantity}.pdf'
+        fig.savefig(path, bbox_inches='tight')
+        print(f'Saved {path!r}', flush=True)
+
 
 if __name__ == '__main__':
     import argparse
 
     parser = argparse.ArgumentParser()
-    parser.add_argument('--scaling', type=str, choices=['strong', 'weak'], default='strong')
     parser.add_argument('--mode', type=str, choices=['run', 'plot'], default='run')
     parser.add_argument('--problem', type=str, default='GS')
     parser.add_argument('--XPU', type=str, choices=['CPU', 'GPU'], default='CPU')
     parser.add_argument('--space_time', type=str, choices=['True', 'False'], default='False')
+    parser.add_argument('--submit', type=str, choices=['True', 'False'], default='True')
+    parser.add_argument('--nsys_profiling', type=str, choices=['True', 'False'], default='False')
 
     args = parser.parse_args()
 
-    strong = args.scaling == 'strong'
+    submit = args.submit == 'True'
+    nsys_profiling = args.nsys_profiling == 'True'
+
+    config_classes = []
 
-    if args.problem == 'GS':
+    if args.problem == 'GS3D':
+        if args.XPU == 'CPU':
+            config_classes += [GrayScottSpaceScalingCPU3D]
+        else:
+            config_classes += [GrayScottSpaceScalingGPU3D]
+    elif args.problem == 'RBC':
         if args.XPU == 'CPU':
-            configClass = GrayScottSpaceScalingCPU
+            config_classes += [RayleighBenardSpaceScalingCPU]
         else:
-            configClass = GrayScottSpaceScalingGPU
+            config_classes += [RayleighBenardSpaceScalingGPU]
+    elif args.problem == 'RBC_dedalus':
+        if args.XPU == 'CPU':
+            config_classes += [RayleighBenardDedalusComparison]
+        else:
+            config_classes += [RayleighBenardDedalusComparisonGPU]
     else:
         raise NotImplementedError(f'Don\'t know problem {args.problem!r}')
 
-    kwargs = {'space_time_parallel': args.space_time}
-    config = configClass(**kwargs)
+    for configClass in config_classes:
+        config = configClass()
 
-    if args.mode == 'run':
-        config.run_scaling_test(strong=strong)
-    elif args.mode == 'plot':
-        plot_scalings(strong=strong, problem=args.problem, kwargs=kwargs)
-    else:
-        raise NotImplementedError(f'Don\'t know mode {args.mode!r}')
+        if args.mode == 'run':
+            config.run_scaling_test(submit=submit, nsys_profiling=nsys_profiling)
+        elif args.mode == 'plot':
+            plot_scalings(problem=args.problem)
+        else:
+            raise NotImplementedError(f'Don\'t know mode {args.mode!r}')
diff --git a/pySDC/projects/GPU/analysis_scripts/plot_RBC_matrix.py b/pySDC/projects/GPU/analysis_scripts/plot_RBC_matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ae2743406ab2ad19000190d6a011cebbb6f9efe
--- /dev/null
+++ b/pySDC/projects/GPU/analysis_scripts/plot_RBC_matrix.py
@@ -0,0 +1,96 @@
+from pySDC.implementations.problem_classes.RayleighBenard import RayleighBenard
+import numpy as np
+import matplotlib.pyplot as plt
+from pySDC.helpers.plot_helper import figsize_by_journal, setup_mpl
+
+
+def plot_preconditioners():  # pragma: no cover
+    P = RayleighBenard(nx=3, nz=5, Dirichlet_recombination=True, left_preconditioner=True)
+    dt = 1.0
+
+    A = P.M + dt * P.L
+    A_b = P.put_BCs_in_matrix(P.M + dt * P.L)
+    A_r = P.spectral.put_BCs_in_matrix(A) @ P.Pr
+    A_l = P.Pl @ P.spectral.put_BCs_in_matrix(A) @ P.Pr
+
+    fig, axs = plt.subplots(1, 4, figsize=figsize_by_journal('TUHH_thesis', 1, 0.4), sharex=True, sharey=True)
+
+    for M, ax in zip([A, A_b, A_r, A_l], axs):
+        ax.imshow((M / abs(M)).real + (M / abs(M)).imag, rasterized=False, cmap='Spectral')
+
+    for ax in axs:
+        ax.set_xticks([])
+        ax.set_yticks([])
+    fig.savefig('plots/RBC_matrix.pdf', bbox_inches='tight', dpi=300)
+    plt.show()
+
+
+def plot_ultraspherical():  # pragma: no cover
+    from pySDC.helpers.spectral_helper import ChebychevHelper, UltrasphericalHelper
+
+    N = 16
+    cheby = ChebychevHelper(N=N)
+    ultra = UltrasphericalHelper(N=N)
+
+    D_cheby = cheby.get_differentiation_matrix()
+    I_cheby = cheby.get_Id()
+
+    fig, axs = plt.subplots(2, 3, figsize=figsize_by_journal('TUHH_thesis', 0.9, 0.65), sharex=True, sharey=True)
+
+    axs[0, 0].imshow(D_cheby / abs(D_cheby))
+    axs[1, 0].imshow(I_cheby / abs(I_cheby))
+
+    for i in range(2):
+        D_ultra = ultra.get_differentiation_matrix(p=i + 1)
+        I_ultra = ultra.get_basis_change_matrix(0, i + 1)
+        axs[0, i + 1].imshow(D_ultra / abs(D_ultra))
+        axs[1, i + 1].imshow(I_ultra / abs(I_ultra))
+        axs[1, i + 1].set_xlabel(rf'$T \rightarrow C^{{({{{i+1}}})}}$')
+    for ax in axs.flatten():
+        ax.set_xticks([])
+        ax.set_yticks([])
+
+    axs[0, 0].set_ylabel('Differentiation')
+    axs[1, 0].set_ylabel('Left preconditioner')
+    axs[1, 0].set_xlabel(r'$T \rightarrow T$')
+
+    axs[0, 0].set_title('first derivative')
+    axs[0, 1].set_title('first derivative')
+    axs[0, 2].set_title('second derivative')
+    fig.savefig('plots/ultraspherical_matrix.pdf', bbox_inches='tight', dpi=300)
+    plt.show()
+
+
+def plot_DCT():  # pragma: no cover
+    fig, axs = plt.subplots(1, 3, figsize=figsize_by_journal('TUHH_thesis', 1, 0.28), sharey=True)
+
+    N = 8
+    color = 'black'
+
+    x = np.linspace(0, 3, N)
+    y = x**3 - 4 * x**2
+    axs[0].plot(y, marker='o', color=color)
+
+    y_m = np.append(y, y[::-1])
+    axs[1].scatter(np.arange(2 * N)[::2], y_m[::2], marker='<', color=color)
+    axs[1].scatter(np.arange(2 * N)[1::2], y_m[1::2], marker='>', color=color)
+    axs[1].plot(np.arange(2 * N), y_m, color=color)
+
+    v = y_m[::2]
+    axs[2].plot(np.arange(N), v, color=color, marker='x')
+
+    axs[0].set_title('original')
+    axs[1].set_title('mirrored')
+    axs[2].set_title('periodically reordered')
+
+    for ax in axs:
+        # ax.set_xlabel(r'$n$')
+        ax.set_yticks([])
+    fig.savefig('plots/DCT_via_FFT.pdf', bbox_inches='tight', dpi=300)
+
+
+if __name__ == '__main__':
+    setup_mpl()
+    plot_DCT()
+    # plot_ultraspherical()
+    plt.show()
diff --git a/pySDC/projects/GPU/analysis_scripts/plot_large_simulations.py b/pySDC/projects/GPU/analysis_scripts/plot_large_simulations.py
new file mode 100644
index 0000000000000000000000000000000000000000..8296007e3844e2d677d39ba66338addbb465eecd
--- /dev/null
+++ b/pySDC/projects/GPU/analysis_scripts/plot_large_simulations.py
@@ -0,0 +1,364 @@
+import pickle
+import numpy as np
+import matplotlib.pyplot as plt
+from pySDC.helpers.stats_helper import get_sorted, get_list_of_types
+from pySDC.helpers.plot_helper import figsize_by_journal, setup_mpl
+from mpi4py import MPI
+
+comm = MPI.COMM_WORLD
+
+try:
+    from tqdm import tqdm
+except ModuleNotFoundError:
+    tqdm = None
+
+
+class PlotLargeRun:  # pragma: no cover
+    name = None
+
+    def __init__(self, res, procs, base_path, max_frames=500, useGPU='False'):
+        self.res = res
+        self.procs = procs
+        self.base_path = base_path
+        self.max_frames = max_frames
+        self.useGPU = useGPU
+
+        self._stats = None
+        self._prob = None
+
+    def get_stats(self):
+        self._stats = {}
+
+        frame_range = range(self.max_frames)
+        if tqdm:
+            frame_range = tqdm(frame_range)
+            frame_range.set_description('Loading files')
+
+        for idx in frame_range:
+            for n_time in range(1):
+                try:
+                    path = self.get_path(idx=idx, n_space=0, n_time=n_time, stats=True)
+                    with open(path, 'rb') as file:
+                        self._stats = {**self._stats, **pickle.load(file)}
+                except FileNotFoundError:
+                    continue
+
+    @property
+    def stats(self):
+        if self._stats is None:
+            self.get_stats()
+        return self._stats
+
+    def get_fig(self, *args, **kwargs):
+        return plt.subplots(*args, figsize=figsize_by_journal('TUHH_thesis', 0.8, 0.6), **kwargs)
+
+    def save_fig(self, fig, name):
+        path = f'plots/{self.name}_{name}.pdf'
+        fig.savefig(path, bbox_inches='tight', dpi=300)
+        print(f'Saved {path!r}', flush=True)
+
+    @property
+    def prob(self):
+        if self._prob is None:
+            self._prob = self.get_problem()
+        return self._prob
+
+    def plot_work(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        for key, label in zip(['factorizations', 'rhs'], ['LU decompositions', 'rhs evaluations']):
+            work = get_sorted(self.stats, type=f'work_{key}')
+            ax.plot([me[0] for me in work], np.cumsum([4 * me[1] for me in work]), label=fr'\#{label}')
+        ax.set_yscale('log')
+        ax.set_xlabel('$t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'work')
+
+    def plot_residual(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        residual = get_sorted(self.stats, type='residual_post_step', recomputed=False)
+        increment = get_sorted(self.stats, type='error_embedded_estimate', recomputed=False)
+        ax.plot([me[0] for me in residual], [me[1] for me in residual], label=r'residual')
+        ax.plot([me[0] for me in increment], [me[1] for me in increment], label=r'$\epsilon$')
+        ax.set_yscale('log')
+        ax.set_xlabel('$t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'residual')
+
+    def get_problem(self):
+        raise NotImplementedError()
+
+
+class PlotRBC(PlotLargeRun):  # pragma: no cover
+    name = 'RBC_large'
+
+    def get_path(self, idx, n_space, n_time=0, stats=False):
+        _name = '-stats' if stats else ''
+        return f'{self.base_path}/data/RayleighBenard_large-res_{self.res}-useGPU_{self.useGPU}-procs_{self.procs[0]}_{self.procs[1]}_{self.procs[2]}-{n_time}-{n_space}-solution_{idx:06d}{_name}.pickle'
+
+    def plot_verification(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+
+        nu = get_sorted(self.stats, type='Nusselt', recomputed=False)
+        for key in ['t', 'b', 'V']:
+            ax.plot([me[0] for me in nu], [me[1][key] for me in nu], label=fr'$Nu_\mathrm{{{key}}}$')
+        ax.legend(frameon=False)
+        ax.set_xlabel('$t$')
+        self.save_fig(fig, 'verification')
+
+    def plot_step_size(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        dt = get_sorted(self.stats, type='dt', recomputed=False)
+        CFL = self.get_CFL_limit()
+
+        ax.plot([me[0] for me in dt], [me[1] for me in dt], label=r'$\Delta t$')
+        ax.plot(CFL.keys(), CFL.values(), label='CFL limit')
+        ax.set_yscale('log')
+        ax.set_xlabel('$t$')
+        ax.set_ylabel(r'$\Delta t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'dt')
+
+    def get_problem(self):
+        from pySDC.projects.GPU.configs.RBC_configs import get_config
+        from pySDC.implementations.problem_classes.RayleighBenard import RayleighBenard
+
+        config = get_config(
+            {
+                'config': 'RBC_large',
+                'procs': [1, 1, self.procs[2]],
+                'res': self.res,
+                'mode': None,
+                'useGPU': False,
+            }
+        )
+        desc = config.get_description(res=self.res)
+
+        return RayleighBenard(**{**desc['problem_params'], 'comm': comm})
+
+    def _compute_CFL_single_frame(self, frame):
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        u = self.prob.u_init
+
+        path = self.get_path(idx=frame, n_space=comm.rank)
+        with open(path, 'rb') as file:
+            data = pickle.load(file)
+
+        for i in range(u.shape[0]):
+            u[i] = data['u'][i]
+
+        CFL = CFLLimit.compute_max_step_size(self.prob, u)
+        return {'CFL': CFL, 't': data['t']}
+
+    def compute_CFL_limit(self):
+        frame_range = range(self.max_frames)
+
+        if tqdm and comm.rank == 0:
+            frame_range = tqdm(frame_range)
+            frame_range.set_description('Computing CFL')
+
+        CFL = {}
+        for frame in frame_range:
+            try:
+                _cfl = self._compute_CFL_single_frame(frame)
+                CFL[_cfl['t']] = _cfl['CFL']
+            except FileNotFoundError:
+                pass
+
+        if comm.rank == 0:
+            with open(self._get_CFL_limit_path(), 'wb') as file:
+                pickle.dump(CFL, file)
+            print(f'Stored {self._get_CFL_limit_path()!r}')
+        return CFL
+
+    def _get_CFL_limit_path(self):
+        return f'{self.base_path}/data/RayleighBenard_large-res_{self.res}-useGPU_{self.useGPU}-procs_{self.procs[0]}_{self.procs[1]}_{self.procs[2]}-CFL_limit.pickle'
+
+    def get_CFL_limit(self, recompute=False):
+        import os
+
+        path = self._get_CFL_limit_path()
+
+        if os.path.exists(path) and not recompute:
+            with open(path, 'rb') as file:
+                CFL = pickle.load(file)
+        else:
+            CFL = self.compute_CFL_limit()
+            with open(path, 'wb') as file:
+                pickle.dump(CFL, file)
+            print(f'Stored {path!r}')
+        return CFL
+
+    def plot_work(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        for key, label in zip(['factorizations', 'rhs'], ['LU decompositions', 'rhs evaluations']):
+            work = get_sorted(self.stats, type=f'work_{key}')
+            ax.plot([me[0] for me in work], np.cumsum([4 * me[1] for me in work]), label=fr'\#{label}')
+        ax.set_yscale('log')
+        ax.set_xlabel('$t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'work')
+
+    def plot_residual(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        residual = get_sorted(self.stats, type='residual_post_step', recomputed=False)
+        increment = get_sorted(self.stats, type='error_embedded_estimate', recomputed=False)
+        ax.plot([me[0] for me in residual], [me[1] for me in residual], label=r'residual')
+        ax.plot([me[0] for me in increment], [me[1] for me in increment], label=r'$\epsilon$')
+        ax.set_yscale('log')
+        ax.set_xlabel('$t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'residual')
+
+    def plot_series(self):  # pragma: no cover
+        indices = [0, 56, 82, 100, 162, 186]
+
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+        plt.rcParams['figure.constrained_layout.use'] = True
+        fig, axs = plt.subplots(
+            len(indices), 1, figsize=figsize_by_journal('TUHH_thesis', 1, 1.3), sharex=True, sharey=True
+        )
+        caxs = {}
+        for i in range(len(indices)):
+            divider = make_axes_locatable(axs[i])
+            caxs[axs[i]] = divider.append_axes('right', size='3%', pad=0.03)
+
+        # get grid
+        X = {}
+        Z = {}
+
+        frame_range = range(self.procs[2])
+        if tqdm:
+            frame_range = tqdm(frame_range)
+            frame_range.set_description('Loading grid')
+
+        for r in frame_range:
+            path = self.get_path(idx=0, n_space=r)
+            with open(path, 'rb') as file:
+                data = pickle.load(file)
+            X[r] = data['X']
+            Z[r] = data['Z']
+
+        if tqdm:
+            frame_range.set_description('Plotting slice')
+
+        def plot_single(idx, ax):  # pragma: no cover
+            for r in frame_range:
+                path = self.get_path(idx=idx, n_space=r)
+                with open(path, 'rb') as file:
+                    data = pickle.load(file)
+                im = ax.pcolormesh(X[r], Z[r], data['u'][2], vmin=0, vmax=2, cmap='plasma', rasterized=True), data['t']
+            return im
+
+        for i, ax in zip(indices, axs):
+            im, t = plot_single(i, ax)
+            fig.colorbar(im, caxs[ax], label=f'$T(t={{{t:.1f}}})$')
+
+        axs[-1].set_xlabel('$x$')
+        axs[-1].set_ylabel('$z$')
+        self.save_fig(fig, 'series')
+
+
+class PlotGS(PlotLargeRun):  # pragma: no cover
+    name = 'GS_large'
+
+    def get_path(self, idx, n_space, n_time=0, stats=False):
+        _name = '-stats' if stats else ''
+        return f'{self.base_path}/data/GrayScottLarge-res_{self.res}-useGPU_{self.useGPU}-procs_{self.procs[0]}_{self.procs[1]}_{self.procs[2]}-{n_time}-{n_space}-solution_{idx:06d}{_name}.pickle'
+
+    def plot_step_size(self):  # pragma: no cover
+        fig, ax = self.get_fig()
+        dt = get_sorted(self.stats, type='dt', recomputed=False)
+
+        ax.plot([me[0] for me in dt], [me[1] for me in dt], label=r'$\Delta t$')
+        ax.set_xlabel('$t$')
+        ax.set_ylabel(r'$\Delta t$')
+        ax.legend(frameon=False)
+        self.save_fig(fig, 'dt')
+
+    def plot_series(self, test=False):  # pragma: no cover
+        if test:
+            indices = [0, 1, 2, 3, 4, 5]
+            process = 0
+            layer = 0
+        else:
+            indices = [91]  # [0, 10, 20, 30, 40, 91]
+            process = 120  # 141#20#96  # 11
+            layer = 6
+
+        from mpl_toolkits.axes_grid1 import make_axes_locatable
+
+        # get grid
+        path = self.get_path(idx=0, n_space=process)
+        with open(path, 'rb') as file:
+            data = pickle.load(file)
+        X = data['X']
+
+        if tqdm:
+            frame_range = tqdm(indices)
+            frame_range.set_description('Plotting slice')
+
+        for frame in frame_range:
+
+            plt.rcParams['figure.constrained_layout.use'] = True
+            fig, ax = plt.subplots(1, 1, figsize=figsize_by_journal('TUHH_thesis', 0.7, 1.1))
+            divider = make_axes_locatable(ax)
+            cax = divider.append_axes('right', size='3%', pad=0.03)
+
+            path = self.get_path(idx=frame, n_space=process)
+            with open(path, 'rb') as file:
+                data = pickle.load(file)
+
+            # im = ax.pcolormesh(X[1][0], X[2][0], np.max(data['v'], axis=0), cmap='binary', rasterized=True, vmin=0, vmax=0.5)
+            # im = ax.pcolormesh(X[1][layer], X[2][layer], data['v'][layer], cmap='binary', rasterized=True, vmin=0, vmax=0.5)
+            im = ax.pcolormesh(
+                X[1][layer], X[2][layer], data['v'][layer], cmap='binary', rasterized=True, vmin=0, vmax=0.5
+            )
+            ax.set_xlim((9, 14))
+            ax.set_ylim((-4, 1))
+
+            fig.colorbar(im, cax)  # , label=f'$T(t={{{t:.1f}}})$')
+
+            ax.set_xlabel('$y$')
+            ax.set_ylabel('$z$')
+            ax.set_aspect(1)
+            self.save_fig(fig, f'series_{frame}_t={data["t"]:.0f}')
+
+
+if __name__ == '__main__':  # pragma: no cover
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--problem', type=str, default='GS', choices=['RBC', 'GS'])
+    parser.add_argument('--config', type=str, default='production', choices=['production', 'test'])
+    args = parser.parse_args()
+
+    setup_mpl()
+
+    if args.problem == 'RBC':
+        if args.config == 'test':
+            plotter = PlotRBC(256, [1, 4, 1], '.', 100)
+        else:
+            plotter = PlotRBC(4096, [1, 4, 1024], '/p/scratch/ccstma/baumann7/large_runs/', 200)
+    elif args.problem == 'GS':
+        if args.config == 'test':
+            plotter = PlotGS(64, [1, 1, 4], '.', 100)
+        else:
+            plotter = PlotGS(2304, [1, 4, 192], '/p/scratch/ccstma/baumann7/large_runs/', 91, 'True')
+
+    if args.problem == 'RBC':
+        if comm.size > 1:
+            plotter.compute_CFL_limit()
+            exit()
+
+        plotter.plot_residual()
+        plotter.plot_step_size()
+        plotter.plot_work()
+        plotter.plot_verification()
+        plotter.plot_series()
+    else:
+        plotter.plot_series(args.config == 'test')
+        plotter.plot_step_size()
+
+    plt.show()
diff --git a/pySDC/projects/GPU/configs/GS_configs.py b/pySDC/projects/GPU/configs/GS_configs.py
index f57cdc7ea2b6746b732a72f8182096beb85e6113..5fde8848d938fd5d6f05c716f0df7fef34d4d572 100644
--- a/pySDC/projects/GPU/configs/GS_configs.py
+++ b/pySDC/projects/GPU/configs/GS_configs.py
@@ -1,4 +1,5 @@
 from pySDC.projects.GPU.configs.base_config import Config
+from mpi4py_fft.distarray import newDistArray
 
 
 def get_config(args):
@@ -14,6 +15,10 @@ def get_config(args):
         return GrayScott_USkate(args)
     elif name == 'GS_scaling':
         return GrayScottScaling(args)
+    elif name == 'GS_scaling3D':
+        return GrayScottScaling3D(args)
+    elif name == 'GS_large':
+        return GrayScottLarge(args)
     else:
         return NotImplementedError(f'Don\'t know config {name}')
 
@@ -33,7 +38,7 @@ class GrayScott(Config):
         import numpy as np
         from pySDC.implementations.hooks.log_solution import LogToFileAfterXs as LogToFile
 
-        LogToFile.path = './data/'
+        LogToFile.path = f'{self.base_path}/data/'
         LogToFile.file_name = f'{self.get_path(ranks=ranks)}-solution'
         LogToFile.time_increment = self.Tend / self.num_frames
 
@@ -41,24 +46,31 @@ class GrayScott(Config):
             P = L.prob
 
             if P.spectral:
-                uend = P.itransform(L.uend)
+                tmpu = newDistArray(P.fft, False)
+                tmpv = newDistArray(P.fft, False)
+                tmpu[:] = P.fft.backward(L.uend[0, ...], tmpu)
+                tmpv[:] = P.fft.backward(L.uend[1, ...], tmpv)
+                uend = P.xp.stack((tmpu, tmpv))
             else:
                 uend = L.uend
 
+            data = {
+                't': L.time + L.dt,
+                'local_slice': P.fft.local_slice(False),
+                'shape': P.fft.global_shape(False),
+            }
+
             if P.useGPU:
-                return {
-                    't': L.time + L.dt,
-                    'u': uend[0].get().view(np.ndarray),
-                    'v': uend[1].get().view(np.ndarray),
-                    'X': L.prob.X.get().view(np.ndarray),
-                }
+                data['u'] = uend[0].get().view(np.ndarray)
+                data['v'] = uend[1].get().view(np.ndarray)
+                if L.time == 0:
+                    data['X'] = [me.get().view(np.ndarray) for me in L.prob.X]
             else:
-                return {
-                    't': L.time + L.dt,
-                    'u': uend[0],
-                    'v': uend[1],
-                    'X': L.prob.X,
-                }
+                data['u'] = uend[0]
+                data['v'] = uend[1]
+                if L.time == 0:
+                    data['X'] = L.prob.X
+            return data
 
         def logging_condition(L):
             sweep = L.sweep
@@ -74,7 +86,7 @@ class GrayScott(Config):
         LogToFile.logging_condition = logging_condition
         return LogToFile
 
-    def plot(self, P, idx, n_procs_list, projection='xy'):  # pragma: no cover
+    def plot(self, P, idx, n_procs_list, projection=0, projection_type='flat'):  # pragma: no cover
         import numpy as np
         from matplotlib import ticker as tkr
 
@@ -87,6 +99,7 @@ class GrayScott(Config):
         vmax = {'u': -np.inf, 'v': -np.inf}
 
         for rank in range(n_procs_list[2]):
+            # for rank in [n_procs_list[2] // 2]:
             ranks = [0, 0] + [rank]
             LogToFile = self.get_LogToFile(ranks=ranks)
 
@@ -112,29 +125,34 @@ class GrayScott(Config):
             else:
                 v3d = buffer[f'u-{rank}']['v'].real
 
-                if projection == 'xy':
+                if projection == 2:
                     slices = [slice(None), slice(None), v3d.shape[2] // 2]
                     x = buffer[f'u-{rank}']['X'][0][*slices]
                     y = buffer[f'u-{rank}']['X'][1][*slices]
                     ax.set_xlabel('$x$')
                     ax.set_ylabel('$y$')
-                elif projection == 'xz':
+                elif projection == 1:
                     slices = [slice(None), v3d.shape[1] // 2, slice(None)]
                     x = buffer[f'u-{rank}']['X'][0][*slices]
                     y = buffer[f'u-{rank}']['X'][2][*slices]
                     ax.set_xlabel('$x$')
                     ax.set_ylabel('$z$')
-                elif projection == 'yz':
+                elif projection == 0:
                     slices = [v3d.shape[0] // 2, slice(None), slice(None)]
                     x = buffer[f'u-{rank}']['X'][1][*slices]
                     y = buffer[f'u-{rank}']['X'][2][*slices]
                     ax.set_xlabel('$y$')
                     ax.set_ylabel('$z$')
 
+                if projection_type == 'sum':
+                    v = v3d.sum(axis=projection)
+                else:
+                    v = v3d[*slices]
+
                 im = ax.pcolormesh(
                     x,
                     y,
-                    v3d[*slices],
+                    v,
                     vmin=vmin['v'],
                     vmax=vmax['v'],
                     cmap='binary',
@@ -177,6 +195,8 @@ class GrayScott_dt_adaptivity(GrayScott):
     Configuration with dt adaptivity added to base configuration
     """
 
+    ndim = 2
+
     def get_description(self, *args, **kwargs):
         from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
 
@@ -208,8 +228,9 @@ class GrayScott_USkate(GrayScott):
     See arXiv:1501.01990 or http://www.mrob.com/sci/papers/2009smp-figs/index.html
     '''
 
-    num_frames = 400
+    num_frames = 200
     res_per_blob = 2**7
+    Tend = 200000
 
     def get_description(self, *args, **kwargs):
         from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
@@ -220,7 +241,6 @@ class GrayScott_USkate(GrayScott):
         desc['problem_params']['Du'] = 2e-5
         desc['problem_params']['Dv'] = 1e-5
         desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-3}
-        self.Tend = 200000
         return desc
 
 
@@ -238,3 +258,136 @@ class GrayScottScaling(GrayScott):
         params = super().get_controller_params(*args, **kwargs)
         params['hook_class'] = []
         return params
+
+
+class GrayScottScaling3D(GrayScottScaling):
+    ndim = 3
+    nsteps = 15
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+
+        desc = super().get_description(*args, **kwargs)
+        desc['problem_params']['L'] = 2
+        desc['problem_params']['num_blobs'] = 4
+        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['step_params']['maxiter'] = 4
+        desc['level_params']['dt'] = 0.1
+        self.Tend = self.nsteps * desc['level_params']['dt']
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        params = super().get_controller_params(*args, **kwargs)
+        params['hook_class'] = []
+        return params
+
+
+class GrayScottLarge(GrayScott):
+    Tend = 5000
+    num_frames = 100
+    res_per_blob = 2**7
+    ndim = 3
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+
+        desc = super().get_description(*args, **kwargs)
+        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['sweeper_params']['QE'] = 'PIC'
+        desc['step_params']['maxiter'] = 4
+        desc['level_params']['dt'] = 1e-1
+        desc['problem_params']['spectral'] = False
+
+        # desc['problem_params']['num_blobs'] *= -1
+        # desc['problem_params']['num_blobs'] = 40
+
+        desc['problem_params']['L'] = 2 * desc['problem_params']['nvars'][0] // self.res_per_blob
+        desc['problem_params']['num_blobs'] = int(1 * desc['problem_params']['L'])
+
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-3}
+        return desc
+
+    def plot(self, P, idx, n_procs_list, projection=2, projection_type='flat'):  # pragma: no cover
+        import numpy as np
+        from matplotlib import ticker as tkr
+        import matplotlib.pyplot as plt
+        import gc
+
+        fig = plt.figure()
+        ax = fig.add_subplot(projection='3d')
+
+        thresh = 0.25
+
+        data = self.get_LogToFile().load(0)
+
+        min_pos = [-20, 2, 12]
+        # box_size = [4, 8, 8]
+        box_size = [
+            4,
+        ] * 3
+
+        ax.set_xlim(min_pos[0], min_pos[0] + box_size[0])
+        ax.set_ylim(min_pos[1], min_pos[1] + box_size[1])
+        ax.set_zlim(min_pos[2], min_pos[2] + box_size[2])
+        ax.set_aspect('equal')
+
+        grid = None
+        for rank in range(n_procs_list[2]):
+            gc.collect()
+            ranks = [0, 0] + [rank]
+            LogToFile = self.get_LogToFile(ranks=ranks)
+
+            data = LogToFile.load(idx)
+            u = data['v']
+            grid = data['X']
+            ax.set_title(f't={data["t"]:.2f}')
+
+            x = grid[0][:, 0, 0]
+            y = grid[1][0, :, 0]
+            z = grid[2][0, 0, :]
+            grids1d = [x, y, z]
+
+            if min_pos[0] > x.max():
+                continue
+            elif (min_pos[0] + box_size[0]) < x.min():
+                break
+
+            slice_starts = [np.searchsorted(grids1d[i], min_pos[i]) for i in range(self.ndim)]
+            slice_ends = [np.searchsorted(grids1d[i], min_pos[i] + box_size[i]) for i in range(self.ndim)]
+            slices = [slice(slice_starts[i], slice_ends[i]) for i in range(self.ndim)]
+            slice_data = [slice(slice_starts[i] + 1, slice_ends[i]) for i in range(self.ndim)]
+
+            if any(abs(slice_starts[i] - slice_ends[i]) <= 1 for i in range(self.ndim)):
+                continue
+
+            mask = u > thresh
+
+            if mask.any():
+                filled = np.zeros_like(u).astype(bool)
+                filled[mask] = True
+                ax.voxels(
+                    grid[0][*slices],
+                    grid[1][*slices],
+                    grid[2][*slices],
+                    filled[*slice_data],
+                    alpha=0.5,
+                    facecolors='teal',
+                )
+                # ax.scatter(grid[0][mask], grid[1][mask], grid[2][mask], alpha=0.1, color='black', marker='.')
+
+            gc.collect()
+
+        ax.set_xlabel('$x$')
+        ax.set_ylabel('$y$')
+        ax.set_zlabel('$z$')
+        return fig
+
+    def get_initial_condition(self, P, *args, restart_idx=0, **kwargs):
+        if restart_idx > 0:
+            return super().get_initial_condition(P, *args, restart_idx=restart_idx, **kwargs)
+        else:
+            _u0 = P.u_exact(t=0)
+            return _u0, 0
diff --git a/pySDC/projects/GPU/configs/RBC_configs.py b/pySDC/projects/GPU/configs/RBC_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..28cea340c6c904e8741823bef2a713677ada8d08
--- /dev/null
+++ b/pySDC/projects/GPU/configs/RBC_configs.py
@@ -0,0 +1,433 @@
+from pySDC.projects.GPU.configs.base_config import Config
+
+
+def get_config(args):
+    name = args['config']
+    if name == 'RBC':
+        return RayleighBenardRegular(args)
+    elif name == 'RBC_dt':
+        return RayleighBenard_dt_adaptivity(args)
+    elif name == 'RBC_k':
+        return RayleighBenard_k_adaptivity(args)
+    elif name == 'RBC_dt_k':
+        return RayleighBenard_dt_k_adaptivity(args)
+    elif name == 'RBC_RK':
+        return RayleighBenardRK(args)
+    elif name == 'RBC_dedalus':
+        return RayleighBenardDedalusComp(args)
+    elif name == 'RBC_Tibo':
+        return RayleighBenard_Thibaut(args)
+    elif name == 'RBC_scaling':
+        return RayleighBenard_scaling(args)
+    elif name == 'RBC_large':
+        return RayleighBenard_large(args)
+    else:
+        raise NotImplementedError(f'There is no configuration called {name!r}!')
+
+
+class RayleighBenardRegular(Config):
+    sweeper_type = 'IMEX'
+    Tend = 50
+
+    def get_LogToFile(self, ranks=None):
+        import numpy as np
+        from pySDC.implementations.hooks.log_solution import LogToFileAfterXs as LogToFile
+
+        LogToFile.path = f'{self.base_path}/data/'
+        LogToFile.file_name = f'{self.get_path(ranks=ranks)}-solution'
+        LogToFile.time_increment = 1e-1
+
+        def process_solution(L):
+            P = L.prob
+
+            if P.spectral_space:
+                uend = P.itransform(L.uend)
+
+            data = {
+                't': L.time + L.dt,
+                'local_slice': P.local_slice,
+                'shape': P.global_shape,
+            }
+
+            if P.useGPU:
+                data['u'] = uend.get().view(np.ndarray)
+                data['vorticity'] = L.prob.compute_vorticity(L.uend).get().view(np.ndarray)
+                if L.time == 0:
+                    data['X'] = L.prob.X.get()
+                    data['Z'] = L.prob.Z.get()
+            else:
+                data['u'] = uend.view(np.ndarray)
+                data['vorticity'] = L.prob.compute_vorticity(L.uend).view(np.ndarray)
+                if L.time == 0:
+                    data['X'] = L.prob.X
+                    data['Z'] = L.prob.Z
+            return data
+
+        def logging_condition(L):
+            sweep = L.sweep
+            if hasattr(sweep, 'comm'):
+                if sweep.comm.rank == sweep.comm.size - 1:
+                    return True
+                else:
+                    return False
+            else:
+                return True
+
+        LogToFile.process_solution = process_solution
+        LogToFile.logging_condition = logging_condition
+        return LogToFile
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.hooks.log_step_size import LogStepSize
+
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] += [LogStepSize]
+        return controller_params
+
+    def get_description(self, *args, MPIsweeper=False, res=-1, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import (
+            RayleighBenard,
+            CFLLimit,
+        )
+        from pySDC.implementations.problem_classes.generic_spectral import (
+            compute_residual_DAE,
+            compute_residual_DAE_MPI,
+        )
+        from pySDC.implementations.convergence_controller_classes.step_size_limiter import StepSizeSlopeLimiter
+
+        desc = super().get_description(*args, MPIsweeper=MPIsweeper, **kwargs)
+
+        if MPIsweeper:
+            desc['sweeper_class'].compute_residual = compute_residual_DAE_MPI
+        else:
+            desc['sweeper_class'].compute_residual = compute_residual_DAE
+
+        desc['level_params']['dt'] = 0.1
+        desc['level_params']['restol'] = 1e-7
+
+        desc['convergence_controllers'][CFLLimit] = {'dt_max': 0.1, 'dt_min': 1e-6, 'cfl': 0.8}
+        desc['convergence_controllers'][StepSizeSlopeLimiter] = {'dt_rel_min_slope': 0.1}
+
+        desc['sweeper_params']['quad_type'] = 'RADAU-RIGHT'
+        desc['sweeper_params']['num_nodes'] = 2
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['sweeper_params']['QE'] = 'PIC'
+
+        desc['problem_params']['Rayleigh'] = 2e6
+        desc['problem_params']['nx'] = 2**8 if res == -1 else res
+        desc['problem_params']['nz'] = desc['problem_params']['nx'] // 4
+        desc['problem_params']['dealiasing'] = 3 / 2
+
+        desc['step_params']['maxiter'] = 3
+
+        desc['problem_class'] = RayleighBenard
+
+        return desc
+
+    def get_initial_condition(self, P, *args, restart_idx=0, **kwargs):
+        if restart_idx > 0:
+            return super().get_initial_condition(P, *args, restart_idx=restart_idx, **kwargs)
+        else:
+            u0 = P.u_exact(t=0, seed=P.comm.rank, noise_level=1e-3)
+            u0_with_pressure = P.solve_system(u0, 1e-9, u0)
+            return u0_with_pressure, 0
+
+    def prepare_caches(self, prob):
+        """
+        Cache the fft objects, which are expensive to create on GPU because graphs have to be initialized.
+        """
+        prob.eval_f(prob.u_init)
+
+    def plot(self, P, idx, n_procs_list, quantitiy='T', quantitiy2='vorticity'):
+        import numpy as np
+
+        cmaps = {'vorticity': 'bwr', 'p': 'bwr'}
+
+        fig = P.get_fig()
+        cax = P.cax
+        axs = fig.get_axes()
+
+        buffer = {}
+        vmin = {quantitiy: np.inf, quantitiy2: np.inf}
+        vmax = {quantitiy: -np.inf, quantitiy2: -np.inf}
+
+        X = {}
+        Z = {}
+
+        for rank in range(n_procs_list[2]):
+            ranks = self.ranks[:-1] + [rank]
+            LogToFile = self.get_LogToFile(ranks=ranks)
+
+            buffer[f'u-{rank}'] = LogToFile.load(idx)
+
+            vmin[quantitiy2] = min([vmin[quantitiy2], buffer[f'u-{rank}'][quantitiy2].real.min()])
+            vmax[quantitiy2] = max([vmax[quantitiy2], buffer[f'u-{rank}'][quantitiy2].real.max()])
+            vmin[quantitiy] = min([vmin[quantitiy], buffer[f'u-{rank}']['u'][P.index(quantitiy)].real.min()])
+            vmax[quantitiy] = max([vmax[quantitiy], buffer[f'u-{rank}']['u'][P.index(quantitiy)].real.max()])
+
+            first_frame = LogToFile.load(0)
+            X[rank] = first_frame['X']
+            Z[rank] = first_frame['Z']
+
+        for rank in range(n_procs_list[2]):
+            im2 = axs[1].pcolormesh(
+                X[rank],
+                Z[rank],
+                buffer[f'u-{rank}'][quantitiy2].real,
+                vmin=-vmax[quantitiy2] if cmaps.get(quantitiy2, None) in ['bwr'] else vmin[quantitiy2],
+                vmax=vmax[quantitiy2],
+                cmap=cmaps.get(quantitiy2, None),
+            )
+            im = axs[0].pcolormesh(
+                X[rank],
+                Z[rank],
+                buffer[f'u-{rank}']['u'][P.index(quantitiy)].real,
+                vmin=vmin[quantitiy],
+                vmax=-vmin[quantitiy] if cmaps.get(quantitiy, None) in ['bwr', 'seismic'] else vmax[quantitiy],
+                cmap=cmaps.get(quantitiy, 'plasma'),
+            )
+        fig.colorbar(im2, cax[1])
+        fig.colorbar(im, cax[0])
+        axs[0].set_title(f't={buffer[f"u-{rank}"]["t"]:.2f}')
+        axs[1].set_xlabel('x')
+        axs[1].set_ylabel('z')
+        axs[0].set_aspect(1.0)
+        axs[1].set_aspect(1.0)
+        return fig
+
+
+class RayleighBenard_k_adaptivity(RayleighBenardRegular):
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import AdaptivityPolynomialError
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'][CFLLimit] = {'dt_max': 0.1, 'dt_min': 1e-6, 'cfl': 0.8}
+        desc['level_params']['restol'] = 1e-7
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['step_params']['maxiter'] = 12
+
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import (
+            LogAnalysisVariables,
+        )
+
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] = [
+            me for me in controller_params['hook_class'] if me is not LogAnalysisVariables
+        ]
+        return controller_params
+
+
+class RayleighBenard_dt_k_adaptivity(RayleighBenardRegular):
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import AdaptivityPolynomialError
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'][AdaptivityPolynomialError] = {
+            'e_tol': 1e-3,
+            'abort_at_growing_residual': False,
+            'interpolate_between_restarts': False,
+            'dt_min': 1e-3,
+            'dt_rel_min_slope': 0.1,
+        }
+        desc['convergence_controllers'].pop(CFLLimit)
+        desc['level_params']['restol'] = 1e-7
+        desc['sweeper_params']['num_nodes'] = 3
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['step_params']['maxiter'] = 16
+        desc['problem_params']['nx'] *= 2
+        desc['problem_params']['nz'] *= 2
+
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import (
+            LogAnalysisVariables,
+        )
+
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] = [
+            me for me in controller_params['hook_class'] if me is not LogAnalysisVariables
+        ]
+        return controller_params
+
+
+class RayleighBenard_dt_adaptivity(RayleighBenardRegular):
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-4, 'dt_rel_min_slope': 0.1}
+        desc['convergence_controllers'].pop(CFLLimit)
+        desc['level_params']['restol'] = -1
+        desc['sweeper_params']['num_nodes'] = 3
+        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
+        desc['step_params']['maxiter'] = 5
+        return desc
+
+
+class RayleighBenard_Thibaut(RayleighBenardRegular):
+    Tend = 1
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'].pop(CFLLimit)
+        desc['level_params']['restol'] = -1
+        desc['level_params']['dt'] = 2e-2 / 4
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['sweeper_params']['node_type'] = 'LEGENDRE'
+        desc['sweeper_params']['quad_type'] = 'RADAU-RIGHT'
+        desc['sweeper_params']['skip_residual_computation'] = ('IT_CHECK', 'IT_DOWN', 'IT_UP', 'IT_FINE', 'IT_COARSE')
+        desc['step_params']['maxiter'] = 4
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] = []
+        return controller_params
+
+
+class RayleighBenardRK(RayleighBenardRegular):
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.sweeper_classes.Runge_Kutta import ARK3
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+        from pySDC.implementations.convergence_controller_classes.step_size_limiter import StepSizeSlopeLimiter
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['sweeper_class'] = ARK3
+
+        desc['step_params']['maxiter'] = 1
+
+        desc['convergence_controllers'][CFLLimit] = {'dt_max': 0.1, 'dt_min': 1e-6, 'cfl': 0.5}
+        desc['convergence_controllers'][StepSizeSlopeLimiter] = {'dt_rel_min_slope': 0.1}
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import (
+            LogAnalysisVariables,
+        )
+
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] = [
+            me for me in controller_params['hook_class'] if me is not LogAnalysisVariables
+        ]
+        return controller_params
+
+
+class RayleighBenardDedalusComp(RayleighBenardRK):
+    Tend = 150
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.sweeper_classes.Runge_Kutta import ARK222
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['sweeper_class'] = ARK222
+
+        desc['step_params']['maxiter'] = 1
+        desc['level_params']['dt'] = 5e-3
+
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+
+        desc['convergence_controllers'].pop(CFLLimit)
+        return desc
+
+
+class RayleighBenard_scaling(RayleighBenardRegular):
+    Tend = 7
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import Adaptivity
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+        from pySDC.implementations.convergence_controller_classes.step_size_limiter import (
+            StepSizeRounding,
+            StepSizeSlopeLimiter,
+        )
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'][Adaptivity] = {'e_tol': 1e-3, 'dt_rel_min_slope': 1.0, 'beta': 0.5}
+        desc['convergence_controllers'][StepSizeRounding] = {}
+        desc['convergence_controllers'][StepSizeSlopeLimiter] = {'dt_rel_min_slope': 1.0}
+        desc['convergence_controllers'].pop(CFLLimit)
+        desc['level_params']['restol'] = -1
+        desc['level_params']['dt'] = 8e-2
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['step_params']['maxiter'] = 4
+        desc['problem_params']['max_cached_factorizations'] = 4
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.hooks.log_work import LogWork
+
+        params = super().get_controller_params(*args, **kwargs)
+        params['hook_class'] = [LogWork]
+        return params
+
+
+class RayleighBenard_large(RayleighBenardRegular):
+    Ra = 3.2e8
+    relaxation_steps = 5
+
+    def get_description(self, *args, **kwargs):
+        from pySDC.implementations.convergence_controller_classes.adaptivity import AdaptivityPolynomialError
+        from pySDC.implementations.problem_classes.RayleighBenard import CFLLimit
+        from pySDC.implementations.convergence_controller_classes.step_size_limiter import StepSizeRounding
+
+        desc = super().get_description(*args, **kwargs)
+
+        desc['convergence_controllers'][AdaptivityPolynomialError] = {
+            'e_tol': 1e-5,
+            'abort_at_growing_residual': False,
+            'interpolate_between_restarts': False,
+            'dt_min': 1e-5,
+            'dt_rel_min_slope': 2,
+            'beta': 0.5,
+        }
+        desc['convergence_controllers'][StepSizeRounding] = {}
+        desc['convergence_controllers'].pop(CFLLimit)
+        desc['level_params']['restol'] = 5e-6
+        desc['level_params']['e_tol'] = 5e-6
+        desc['level_params']['dt'] = 5e-3
+        desc['sweeper_params']['num_nodes'] = 4
+        desc['sweeper_params']['QI'] = 'MIN-SR-S'
+        desc['sweeper_params']['QE'] = 'PIC'
+        desc['step_params']['maxiter'] = 16
+
+        desc['problem_params']['Rayleigh'] = self.Ra
+        desc['problem_params']['max_cached_factorizations'] = 4
+
+        return desc
+
+    def get_controller_params(self, *args, **kwargs):
+        from pySDC.implementations.problem_classes.RayleighBenard import (
+            LogAnalysisVariables,
+        )
+
+        controller_params = super().get_controller_params(*args, **kwargs)
+        controller_params['hook_class'] += [LogAnalysisVariables]
+        return controller_params
+
+    def get_initial_condition(self, P, *args, restart_idx=0, **kwargs):
+        if restart_idx > 0:
+            return super().get_initial_condition(P, *args, restart_idx=restart_idx, **kwargs)
+        else:
+            u0 = P.u_exact(t=0, seed=P.comm.rank, noise_level=1e-3)
+            for _ in range(self.relaxation_steps):
+                u0 = P.solve_system(u0, 1e-1, u0)
+            return u0, 0
diff --git a/pySDC/projects/GPU/configs/base_config.py b/pySDC/projects/GPU/configs/base_config.py
index 00755c6921fe1d7501024926e069c37dfdfd7a1a..51fc3ecf696d37066b9a8f1744c964874daf0686 100644
--- a/pySDC/projects/GPU/configs/base_config.py
+++ b/pySDC/projects/GPU/configs/base_config.py
@@ -6,12 +6,14 @@ import numpy as np
 def get_config(args):
     name = args['config']
     if name[:2] == 'GS':
-        from pySDC.projects.GPU.configs.GS_configs import get_config
-
-        return get_config(args)
+        from pySDC.projects.GPU.configs.GS_configs import get_config as _get_config
+    elif name[:3] == 'RBC':
+        from pySDC.projects.GPU.configs.RBC_configs import get_config as _get_config
     else:
         raise NotImplementedError(f'There is no configuration called {name!r}!')
 
+    return _get_config(args)
+
 
 def get_comms(n_procs_list, comm_world=None, _comm=None, _tot_rank=0, _rank=None, useGPU=False):
     from mpi4py import MPI
@@ -56,6 +58,7 @@ def get_comms(n_procs_list, comm_world=None, _comm=None, _tot_rank=0, _rank=None
 class Config(object):
     sweeper_type = None
     Tend = None
+    base_path = './'
 
     def __init__(self, args, comm_world=None):
         from mpi4py import MPI
@@ -66,7 +69,7 @@ class Config(object):
         if args['mode'] == 'run':
             self.comms = get_comms(n_procs_list=self.n_procs_list, useGPU=args['useGPU'], comm_world=self.comm_world)
         else:
-            self.comms = [self.comm_world, self.comm_world, self.comm_world]
+            self.comms = [MPI.COMM_SELF, MPI.COMM_SELF, MPI.COMM_SELF]
         self.ranks = [me.rank for me in self.comms]
 
     def get_description(self, *args, MPIsweeper=False, useGPU=False, **kwargs):
@@ -88,10 +91,12 @@ class Config(object):
 
     def get_controller_params(self, *args, logger_level=15, **kwargs):
         from pySDC.implementations.hooks.log_work import LogWork
+        from pySDC.implementations.hooks.log_step_size import LogStepSize
+        from pySDC.implementations.hooks.log_restarts import LogRestarts
 
         controller_params = {}
         controller_params['logger_level'] = logger_level if self.comm_world.rank == 0 else 40
-        controller_params['hook_class'] = [LogWork]
+        controller_params['hook_class'] = [LogWork, LogStepSize, LogRestarts]
         logToFile = self.get_LogToFile()
         if logToFile:
             controller_params['hook_class'] += [logToFile]
@@ -112,6 +117,9 @@ class Config(object):
 
         return sweeper
 
+    def prepare_caches(self, prob):
+        pass
+
     def get_path(self, *args, ranks=None, **kwargs):
         ranks = self.ranks if ranks is None else ranks
         return f'{type(self).__name__}{self.args_to_str()}-{ranks[0]}-{ranks[2]}'
@@ -190,10 +198,17 @@ class LogStats(ConvergenceController):
         for _hook in controller.hooks:
             _hook.post_step(S, 0)
 
+        P = S.levels[0].prob
+        skip_logging = False
+        if hasattr(P, 'comm'):
+            if P.comm.rank > 0:
+                skip_logging = True
+                return None
+
         while self.counter < hook.counter:
             path = self.get_stats_path(hook, index=self.counter)
             stats = controller.return_stats()
-            if hook.logging_condition(S.levels[0]):
+            if hook.logging_condition(S.levels[0]) and not skip_logging:
                 with open(path, 'wb') as file:
                     pickle.dump(stats, file)
                     self.log(f'Stored stats in {path!r}', S)
diff --git a/pySDC/projects/GPU/paper_plots.py b/pySDC/projects/GPU/paper_plots.py
new file mode 100644
index 0000000000000000000000000000000000000000..b56b1f36bd7c5b31c2b6e1dacbc90dd40b56a37b
--- /dev/null
+++ b/pySDC/projects/GPU/paper_plots.py
@@ -0,0 +1,78 @@
+""" Make plots for publications """
+
+import matplotlib.pyplot as plt
+from pySDC.helpers.plot_helper import setup_mpl, figsize_by_journal
+
+
+def plot_scalings_separately(problem, journal='TUHH_thesis', **kwargs):  # pragma: no cover
+    from pySDC.projects.GPU.analysis_scripts.parallel_scaling import (
+        plot_scalings,
+        GrayScottSpaceScalingGPU3D,
+        GrayScottSpaceScalingCPU3D,
+        RayleighBenardSpaceScalingCPU,
+        RayleighBenardSpaceScalingGPU,
+        PROJECT_PATH,
+    )
+
+    if problem == 'GS3D':
+        configs = [
+            GrayScottSpaceScalingCPU3D(),
+            GrayScottSpaceScalingGPU3D(),
+        ]
+    elif problem == 'RBC':
+        configs = [
+            RayleighBenardSpaceScalingCPU(),
+            RayleighBenardSpaceScalingGPU(),
+        ]
+
+    else:
+        raise NotImplementedError
+
+    ideal_lines = {
+        ('GS3D', 'throughput'): {'x': [0.25, 400], 'y': [5e6, 8e9]},
+        ('GS3D', 'time'): {'x': [0.25, 400], 'y': [80, 5e-2]},
+    }
+
+    fig, ax = plt.subplots(figsize=figsize_by_journal(journal, 1, 0.6))
+    configs[1].plot_scaling_test(ax=ax, quantity='efficiency')
+    ax.legend(frameon=False)
+    path = f'{PROJECT_PATH}/plots/scaling_{problem}_efficiency.pdf'
+    fig.savefig(path, bbox_inches='tight')
+    print(f'Saved {path!r}', flush=True)
+
+    for quantity in ['time']:
+        fig, ax = plt.subplots(figsize=figsize_by_journal(journal, 1, 0.6))
+        for config in configs:
+            config.plot_scaling_test(ax=ax, quantity=quantity)
+        if (problem, quantity) in ideal_lines.keys():
+            ax.loglog(*ideal_lines[(problem, quantity)].values(), color='black', ls=':', label='ideal')
+        ax.legend(frameon=False)
+        path = f'{PROJECT_PATH}/plots/scaling_{problem}_{quantity}.pdf'
+        fig.savefig(path, bbox_inches='tight')
+        print(f'Saved {path!r}', flush=True)
+
+
+def make_plots_for_thesis():  # pragma: no cover
+    from pySDC.projects.GPU.analysis_scripts.plot_RBC_matrix import plot_DCT, plot_preconditioners, plot_ultraspherical
+
+    # small plots with no simulations
+    plot_DCT()
+    plot_preconditioners()
+    plot_ultraspherical()
+
+    # plot space-time parallel scaling
+    for problem in ['GS3D', 'RBC']:
+        plot_scalings_separately(problem=problem)
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--target', choices=['thesis'], type=str)
+    args = parser.parse_args()
+
+    if args.target == 'thesis':
+        make_plots_for_thesis()
+    else:
+        raise NotImplementedError(f'Don\'t know how to make plots for target {args.target}')
diff --git a/pySDC/projects/GPU/run_experiment.py b/pySDC/projects/GPU/run_experiment.py
index 35b7bbc64a1d497a8c7b26a39ffb9f10e0794451..c0726861ae0c434f217602a09de7fbbe0b674bb3 100644
--- a/pySDC/projects/GPU/run_experiment.py
+++ b/pySDC/projects/GPU/run_experiment.py
@@ -12,7 +12,11 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--useGPU', type=cast_to_bool, help='Toggle for GPUs', default=False)
     parser.add_argument(
-        '--mode', type=str, help='Mode for this script', default=None, choices=['run', 'plot', 'render', 'video']
+        '--mode',
+        type=str,
+        help='Mode for this script',
+        default=None,
+        choices=['run', 'plot', 'render', 'plot_series', 'video'],
     )
     parser.add_argument('--config', type=str, help='Configuration to load', default=None)
     parser.add_argument('--restart_idx', type=int, help='Restart from file by index', default=0)
@@ -21,6 +25,7 @@ def parse_args():
     parser.add_argument(
         '--logger_level', type=int, help='Logger level on the first rank in space and in the sweeper', default=15
     )
+    parser.add_argument('-o', type=str, help='output path', default='./')
 
     return vars(parser.parse_args())
 
@@ -32,11 +37,17 @@ def run_experiment(args, config, **kwargs):
     from pySDC.implementations.controller_classes.controller_nonMPI import controller_nonMPI
     from pySDC.helpers.stats_helper import filter_stats
 
+    type(config).base_path = args['o']
     description = config.get_description(
         useGPU=args['useGPU'], MPIsweeper=args['procs'][1] > 1, res=args['res'], **kwargs
     )
     controller_params = config.get_controller_params(logger_level=args['logger_level'])
 
+    if args['useGPU']:
+        from pySDC.implementations.hooks.log_timings import GPUTimings
+
+        controller_params['hook_class'].append(GPUTimings)
+
     # controller = controller_MPI(controller_params, description, config.comms[0])
     assert (
         config.comms[0].size == 1
@@ -46,12 +57,15 @@ def run_experiment(args, config, **kwargs):
 
     u0, t0 = config.get_initial_condition(prob, restart_idx=args['restart_idx'])
 
+    config.prepare_caches(prob)
+
     uend, stats = controller.run(u0=u0, t0=t0, Tend=config.Tend)
 
     combined_stats = filter_stats(stats, comm=config.comm_world)
 
     if config.comm_world.rank == config.comm_world.size - 1:
-        path = f'data/{config.get_path()}-stats-whole-run.pickle'
+        path = f'{config.base_path}/data/{config.get_path()}-stats-whole-run.pickle'
+        # path = f'/p/project1/ccstma/baumann7/pySDC/pySDC/projects/GPU/data/{config.get_path()}-stats-whole-run.pickle'
         with open(path, 'wb') as file:
             pickle.dump(combined_stats, file)
         print(f'Stored stats in {path}', flush=True)
@@ -63,6 +77,7 @@ def plot_experiment(args, config):  # pragma: no cover
     import gc
     import matplotlib.pyplot as plt
 
+    type(config).base_path = args['o']
     description = config.get_description()
 
     P = description['problem_class'](**description['problem_params'])
@@ -87,6 +102,46 @@ def plot_experiment(args, config):  # pragma: no cover
         gc.collect()
 
 
+def plot_series(args, config):  # pragma: no cover
+    import numpy as np
+    import gc
+    import matplotlib.pyplot as plt
+    from pySDC.helpers.plot_helper import figsize_by_journal, setup_mpl
+
+    setup_mpl()
+
+    fig, axs = plt.subplots(
+        3, 3, figsize=figsize_by_journal("TUHH_thesis", 1.0, 1.2), sharex=True, sharey=True, layout="compressed"
+    )
+
+    type(config).base_path = args['o']
+    description = config.get_description()
+
+    P = description['problem_class'](**description['problem_params'])
+
+    idxs = np.linspace(0, config.num_frames * 0.9, 9, dtype=int)
+
+    for idx, ax in zip(idxs, axs.flatten()):
+        try:
+            _fig = config.plot(P=P, idx=idx, n_procs_list=args['procs'], ax=ax)
+        except FileNotFoundError:
+            break
+
+        plt.close(_fig)
+        del _fig
+        gc.collect()
+
+    for ax in axs.flatten():
+        if ax != axs[-1, 0]:
+            ax.set_xlabel('')
+            ax.set_ylabel('')
+
+    path = f'plots/{config.get_path(ranks=[0,0,0])}-series.pdf'
+    fig.savefig(path, dpi=300)
+    print(f'Stored figure {path!r}', flush=True)
+    plt.show()
+
+
 def make_video(args, config):  # pragma: no cover
     comm = config.comm_world
     if comm.rank > 0:
@@ -97,7 +152,7 @@ def make_video(args, config):  # pragma: no cover
     path = f'simulation_plots/{config.get_path(ranks=[0,0,0])}-%06d.png'
     path_target = f'videos/{args["config"]}.mp4'
 
-    cmd = f'ffmpeg -i {path} -pix_fmt yuv420p -r 9 -s 2048:1536 {path_target}'.split()
+    cmd = f'ffmpeg -i {path} -pix_fmt yuv420p -r 9 -s 2048:1536 -y {path_target}'.split()
 
     subprocess.run(cmd)
 
@@ -113,5 +168,7 @@ if __name__ == '__main__':
         run_experiment(args, config)
     elif args['mode'] in ['plot', 'render']:  # pragma: no cover
         plot_experiment(args, config)
+    elif args['mode'] == 'plot_series':  # pragma: no cover
+        plot_series(args, config)
     elif args['mode'] == 'video':  # pragma: no cover
         make_video(args, config)