From 05cf836e5f565be9f41547a7aada47a2da30075d Mon Sep 17 00:00:00 2001
From: Thomas Baumann <39156931+brownbaerchen@users.noreply.github.com>
Date: Wed, 13 Nov 2024 09:23:33 +0100
Subject: [PATCH] Implemented hook for measuring GPU times

---
 pySDC/core/controller.py                    |   3 +-
 pySDC/implementations/hooks/default_hook.py | 251 --------------
 pySDC/implementations/hooks/log_timings.py  | 341 ++++++++++++++++++++
 3 files changed, 343 insertions(+), 252 deletions(-)
 create mode 100644 pySDC/implementations/hooks/log_timings.py

diff --git a/pySDC/core/controller.py b/pySDC/core/controller.py
index 2549cc622..e5b2748a3 100644
--- a/pySDC/core/controller.py
+++ b/pySDC/core/controller.py
@@ -7,6 +7,7 @@ from pySDC.core.base_transfer import BaseTransfer
 from pySDC.helpers.pysdc_helper import FrozenClass
 from pySDC.implementations.convergence_controller_classes.check_convergence import CheckConvergence
 from pySDC.implementations.hooks.default_hook import DefaultHooks
+from pySDC.implementations.hooks.log_timings import CPUTimings
 
 
 # short helper class to add params as attributes
@@ -43,7 +44,7 @@ class Controller(object):
 
         # check if we have a hook on this list. If not, use default class.
         self.__hooks = []
-        hook_classes = [DefaultHooks]
+        hook_classes = [DefaultHooks, CPUTimings]
         user_hooks = controller_params.get('hook_class', [])
         hook_classes += user_hooks if type(user_hooks) == list else [user_hooks]
         [self.add_hook(hook) for hook in hook_classes]
diff --git a/pySDC/implementations/hooks/default_hook.py b/pySDC/implementations/hooks/default_hook.py
index c9d83feb5..3c800b665 100644
--- a/pySDC/implementations/hooks/default_hook.py
+++ b/pySDC/implementations/hooks/default_hook.py
@@ -5,155 +5,8 @@ from pySDC.core.hooks import Hooks
 class DefaultHooks(Hooks):
     """
     Hook class to contain the functions called during the controller runs (e.g. for calling user-routines)
-
-    Attributes:
-        __t0_setup (float): private variable to get starting time of setup
-        __t0_run (float): private variable to get starting time of the run
-        __t0_predict (float): private variable to get starting time of the predictor
-        __t0_step (float): private variable to get starting time of the step
-        __t0_iteration (float): private variable to get starting time of the iteration
-        __t0_sweep (float): private variable to get starting time of the sweep
-        __t0_comm (list): private variable to get starting time of the communication
-        __t1_run (float): private variable to get end time of the run
-        __t1_predict (float): private variable to get end time of the predictor
-        __t1_step (float): private variable to get end time of the step
-        __t1_iteration (float): private variable to get end time of the iteration
-        __t1_sweep (float): private variable to get end time of the sweep
-        __t1_setup (float): private variable to get end time of setup
-        __t1_comm (list): private variable to hold timing of the communication (!)
     """
 
-    def __init__(self):
-        super().__init__()
-        self.__t0_setup = None
-        self.__t0_run = None
-        self.__t0_predict = None
-        self.__t0_step = None
-        self.__t0_iteration = None
-        self.__t0_sweep = None
-        self.__t0_comm = []
-        self.__t1_run = None
-        self.__t1_predict = None
-        self.__t1_step = None
-        self.__t1_iteration = None
-        self.__t1_sweep = None
-        self.__t1_setup = None
-        self.__t1_comm = []
-
-    def pre_setup(self, step, level_number):
-        """
-        Default routine called before setup starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_setup(step, level_number)
-        self.__t0_setup = time.perf_counter()
-
-    def pre_run(self, step, level_number):
-        """
-        Default routine called before time-loop starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_run(step, level_number)
-        self.__t0_run = time.perf_counter()
-
-    def pre_predict(self, step, level_number):
-        """
-        Default routine called before predictor starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_predict(step, level_number)
-        self.__t0_predict = time.perf_counter()
-
-    def pre_step(self, step, level_number):
-        """
-        Hook called before each step
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_step(step, level_number)
-        self.__t0_step = time.perf_counter()
-
-    def pre_iteration(self, step, level_number):
-        """
-        Default routine called before iteration starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_iteration(step, level_number)
-        self.__t0_iteration = time.perf_counter()
-
-    def pre_sweep(self, step, level_number):
-        """
-        Default routine called before sweep starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_sweep(step, level_number)
-        self.__t0_sweep = time.perf_counter()
-
-    def pre_comm(self, step, level_number):
-        """
-        Default routine called before communication starts
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().pre_comm(step, level_number)
-        if len(self.__t0_comm) >= level_number + 1:
-            self.__t0_comm[level_number] = time.perf_counter()
-        else:
-            while len(self.__t0_comm) < level_number:
-                self.__t0_comm.append(None)
-            self.__t0_comm.append(time.perf_counter())
-            while len(self.__t1_comm) <= level_number:
-                self.__t1_comm.append(0.0)
-            assert len(self.__t0_comm) == level_number + 1
-            assert len(self.__t1_comm) == level_number + 1
-
-    def post_comm(self, step, level_number, add_to_stats=False):
-        """
-        Default routine called after each communication
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-            add_to_stats (bool): set if result should go to stats object
-        """
-        super().post_comm(step, level_number)
-        assert len(self.__t1_comm) >= level_number + 1
-        self.__t1_comm[level_number] += time.perf_counter() - self.__t0_comm[level_number]
-
-        if add_to_stats:
-            L = step.levels[level_number]
-
-            self.add_to_stats(
-                process=step.status.slot,
-                process_sweeper=L.sweep.rank,
-                time=L.time,
-                level=L.level_index,
-                iter=step.status.iter,
-                sweep=L.status.sweep,
-                type='timing_comm',
-                value=self.__t1_comm[level_number],
-            )
-            self.__t1_comm[level_number] = 0.0
-
     def post_sweep(self, step, level_number):
         """
         Default routine called after each sweep
@@ -163,7 +16,6 @@ class DefaultHooks(Hooks):
             level_number (int): the current level number
         """
         super().post_sweep(step, level_number)
-        self.__t1_sweep = time.perf_counter()
 
         L = step.levels[level_number]
 
@@ -188,16 +40,6 @@ class DefaultHooks(Hooks):
             type='residual_post_sweep',
             value=L.status.residual,
         )
-        self.add_to_stats(
-            process=step.status.slot,
-            process_sweeper=L.sweep.rank,
-            time=L.time,
-            level=L.level_index,
-            iter=step.status.iter,
-            sweep=L.status.sweep,
-            type='timing_sweep',
-            value=self.__t1_sweep - self.__t0_sweep,
-        )
 
     def post_iteration(self, step, level_number):
         """
@@ -208,7 +50,6 @@ class DefaultHooks(Hooks):
             level_number (int): the current level number
         """
         super().post_iteration(step, level_number)
-        self.__t1_iteration = time.perf_counter()
 
         L = step.levels[level_number]
 
@@ -222,16 +63,6 @@ class DefaultHooks(Hooks):
             type='residual_post_iteration',
             value=L.status.residual,
         )
-        self.add_to_stats(
-            process=step.status.slot,
-            process_sweeper=L.sweep.rank,
-            time=L.time,
-            level=L.level_index,
-            iter=step.status.iter,
-            sweep=L.status.sweep,
-            type='timing_iteration',
-            value=self.__t1_iteration - self.__t0_iteration,
-        )
 
     def post_step(self, step, level_number):
         """
@@ -242,20 +73,9 @@ class DefaultHooks(Hooks):
             level_number (int): the current level number
         """
         super().post_step(step, level_number)
-        self.__t1_step = time.perf_counter()
 
         L = step.levels[level_number]
 
-        self.add_to_stats(
-            process=step.status.slot,
-            process_sweeper=L.sweep.rank,
-            time=L.time,
-            level=L.level_index,
-            iter=step.status.iter,
-            sweep=L.status.sweep,
-            type='timing_step',
-            value=self.__t1_step - self.__t0_step,
-        )
         self.add_to_stats(
             process=step.status.slot,
             process_sweeper=L.sweep.rank,
@@ -289,74 +109,3 @@ class DefaultHooks(Hooks):
                 value=step.status.get('restart'),
                 process_sweeper=-1,
             )
-
-    def post_predict(self, step, level_number):
-        """
-        Default routine called after each predictor
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().post_predict(step, level_number)
-        self.__t1_predict = time.perf_counter()
-
-        L = step.levels[level_number]
-
-        self.add_to_stats(
-            process=step.status.slot,
-            process_sweeper=L.sweep.rank,
-            time=L.time,
-            level=L.level_index,
-            iter=step.status.iter,
-            sweep=L.status.sweep,
-            type='timing_predictor',
-            value=self.__t1_predict - self.__t0_predict,
-        )
-
-    def post_run(self, step, level_number):
-        """
-        Default routine called after each run
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().post_run(step, level_number)
-        self.__t1_run = time.perf_counter()
-
-        L = step.levels[level_number]
-
-        self.add_to_stats(
-            process=step.status.slot,
-            process_sweeper=L.sweep.rank,
-            time=L.time,
-            level=L.level_index,
-            iter=step.status.iter,
-            sweep=L.status.sweep,
-            type='timing_run',
-            value=self.__t1_run - self.__t0_run,
-        )
-        self.logger.info(f'Finished run after {self.__t1_run - self.__t0_run:.2f}s')
-
-    def post_setup(self, step, level_number):
-        """
-        Default routine called after setup
-
-        Args:
-            step (pySDC.Step.step): the current step
-            level_number (int): the current level number
-        """
-        super().post_setup(step, level_number)
-        self.__t1_setup = time.perf_counter()
-
-        self.add_to_stats(
-            process=-1,
-            process_sweeper=-1,
-            time=-1,
-            level=-1,
-            iter=-1,
-            sweep=-1,
-            type='timing_setup',
-            value=self.__t1_setup - self.__t0_setup,
-        )
diff --git a/pySDC/implementations/hooks/log_timings.py b/pySDC/implementations/hooks/log_timings.py
new file mode 100644
index 000000000..b7a2305bb
--- /dev/null
+++ b/pySDC/implementations/hooks/log_timings.py
@@ -0,0 +1,341 @@
+import time
+from pySDC.core.hooks import Hooks
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+
+class Timings(Hooks):
+    """
+    Abstract base class for recoding timings
+
+    Attributes:
+        __t0_setup (float): private variable to get starting time of setup
+        __t0_run (float): private variable to get starting time of the run
+        __t0_predict (float): private variable to get starting time of the predictor
+        __t0_step (float): private variable to get starting time of the step
+        __t0_iteration (float): private variable to get starting time of the iteration
+        __t0_sweep (float): private variable to get starting time of the sweep
+        __t0_comm (list): private variable to get starting time of the communication
+        __t1_run (float): private variable to get end time of the run
+        __t1_predict (float): private variable to get end time of the predictor
+        __t1_step (float): private variable to get end time of the step
+        __t1_iteration (float): private variable to get end time of the iteration
+        __t1_sweep (float): private variable to get end time of the sweep
+        __t1_setup (float): private variable to get end time of setup
+        __t1_comm (list): private variable to hold timing of the communication (!)
+    """
+
+    prefix = ''
+
+    def _compute_time_elapsed(self, event_after, event_before):
+        raise NotImplementedError
+
+    def _get_event(self):
+        raise NotImplementedError
+
+    def __init__(self):
+        super().__init__()
+        self.__t0_setup = None
+        self.__t0_run = None
+        self.__t0_predict = None
+        self.__t0_step = None
+        self.__t0_iteration = None
+        self.__t0_sweep = None
+        self.__t0_comm = []
+        self.__t1_run = None
+        self.__t1_predict = None
+        self.__t1_step = None
+        self.__t1_iteration = None
+        self.__t1_sweep = None
+        self.__t1_setup = None
+        self.__t1_comm = []
+
+    def pre_setup(self, step, level_number):
+        """
+        Default routine called before setup starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_setup(step, level_number)
+        self.__t0_setup = self._get_event()
+
+    def pre_run(self, step, level_number):
+        """
+        Default routine called before time-loop starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_run(step, level_number)
+        self.__t0_run = self._get_event()
+
+    def pre_predict(self, step, level_number):
+        """
+        Default routine called before predictor starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_predict(step, level_number)
+        self.__t0_predict = self._get_event()
+
+    def pre_step(self, step, level_number):
+        """
+        Hook called before each step
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_step(step, level_number)
+        self.__t0_step = self._get_event()
+
+    def pre_iteration(self, step, level_number):
+        """
+        Default routine called before iteration starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_iteration(step, level_number)
+        self.__t0_iteration = self._get_event()
+
+    def pre_sweep(self, step, level_number):
+        """
+        Default routine called before sweep starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_sweep(step, level_number)
+        self.__t0_sweep = self._get_event()
+
+    def pre_comm(self, step, level_number):
+        """
+        Default routine called before communication starts
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().pre_comm(step, level_number)
+        if len(self.__t0_comm) >= level_number + 1:
+            self.__t0_comm[level_number] = self._get_event()
+        else:
+            while len(self.__t0_comm) < level_number:
+                self.__t0_comm.append(None)
+            self.__t0_comm.append(self._get_event())
+            while len(self.__t1_comm) <= level_number:
+                self.__t1_comm.append(0.0)
+            assert len(self.__t0_comm) == level_number + 1
+            assert len(self.__t1_comm) == level_number + 1
+
+    def post_comm(self, step, level_number, add_to_stats=False):
+        """
+        Default routine called after each communication
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+            add_to_stats (bool): set if result should go to stats object
+        """
+        super().post_comm(step, level_number)
+        assert len(self.__t1_comm) >= level_number + 1
+        self.__t1_comm[level_number] += self._compute_time_elapsed(self._get_event(), self.__t0_comm[level_number])
+
+        if add_to_stats:
+            L = step.levels[level_number]
+
+            self.add_to_stats(
+                process=step.status.slot,
+                process_sweeper=L.sweep.rank,
+                time=L.time,
+                level=L.level_index,
+                iter=step.status.iter,
+                sweep=L.status.sweep,
+                type=f'{self.prefix}timing_comm',
+                value=self.__t1_comm[level_number],
+            )
+            self.__t1_comm[level_number] = 0.0
+
+    def post_sweep(self, step, level_number):
+        """
+        Default routine called after each sweep
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_sweep(step, level_number)
+        self.__t1_sweep = self._get_event()
+
+        L = step.levels[level_number]
+
+        self.add_to_stats(
+            process=step.status.slot,
+            process_sweeper=L.sweep.rank,
+            time=L.time,
+            level=L.level_index,
+            iter=step.status.iter,
+            sweep=L.status.sweep,
+            type=f'{self.prefix}timing_sweep',
+            value=self._compute_time_elapsed(self.__t1_sweep, self.__t0_sweep),
+        )
+
+    def post_iteration(self, step, level_number):
+        """
+        Default routine called after each iteration
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_iteration(step, level_number)
+        self.__t1_iteration = self._get_event()
+
+        L = step.levels[level_number]
+
+        self.add_to_stats(
+            process=step.status.slot,
+            process_sweeper=L.sweep.rank,
+            time=L.time,
+            level=L.level_index,
+            iter=step.status.iter,
+            sweep=L.status.sweep,
+            type=f'{self.prefix}timing_iteration',
+            value=self._compute_time_elapsed(self.__t1_iteration, self.__t0_iteration),
+        )
+
+    def post_step(self, step, level_number):
+        """
+        Default routine called after each step or block
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_step(step, level_number)
+        self.__t1_step = self._get_event()
+
+        L = step.levels[level_number]
+
+        self.add_to_stats(
+            process=step.status.slot,
+            process_sweeper=L.sweep.rank,
+            time=L.time,
+            level=L.level_index,
+            iter=step.status.iter,
+            sweep=L.status.sweep,
+            type=f'{self.prefix}timing_step',
+            value=self._compute_time_elapsed(self.__t1_step, self.__t0_step),
+        )
+
+    def post_predict(self, step, level_number):
+        """
+        Default routine called after each predictor
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_predict(step, level_number)
+        self.__t1_predict = self._get_event()
+
+        L = step.levels[level_number]
+
+        self.add_to_stats(
+            process=step.status.slot,
+            process_sweeper=L.sweep.rank,
+            time=L.time,
+            level=L.level_index,
+            iter=step.status.iter,
+            sweep=L.status.sweep,
+            type=f'{self.prefix}timing_predictor',
+            value=self._compute_time_elapsed(self.__t1_predict, self.__t0_predict),
+        )
+
+    def post_run(self, step, level_number):
+        """
+        Default routine called after each run
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_run(step, level_number)
+        self.__t1_run = self._get_event()
+        t_run = self._compute_time_elapsed(self.__t1_run, self.__t0_run)
+
+        L = step.levels[level_number]
+
+        self.add_to_stats(
+            process=step.status.slot,
+            process_sweeper=L.sweep.rank,
+            time=L.time,
+            level=L.level_index,
+            iter=step.status.iter,
+            sweep=L.status.sweep,
+            type=f'{self.prefix}timing_run',
+            value=t_run,
+        )
+        self.logger.info(f'Finished run after {t_run:.2e}s')
+
+    def post_setup(self, step, level_number):
+        """
+        Default routine called after setup
+
+        Args:
+            step (pySDC.Step.step): the current step
+            level_number (int): the current level number
+        """
+        super().post_setup(step, level_number)
+        self.__t1_setup = self._get_event()
+
+        self.add_to_stats(
+            process=-1,
+            process_sweeper=-1,
+            time=-1,
+            level=-1,
+            iter=-1,
+            sweep=-1,
+            type=f'{self.prefix}timing_setup',
+            value=self._compute_time_elapsed(self.__t1_setup, self.__t0_setup),
+        )
+
+
+class CPUTimings(Timings):
+    """
+    Hook for recording CPU timings of important operations during a pySDC run.
+    """
+
+    def _compute_time_elapsed(self, event_after, event_before):
+        return event_after - event_before
+
+    def _get_event(self):
+        return time.perf_counter()
+
+
+class GPUTimings(Timings):
+    """
+    Hook for recording GPU timings of important operations during a pySDC run.
+    """
+
+    prefix = 'GPU_'
+
+    def _compute_time_elapsed(self, event_after, event_before):
+        event_after.synchronize()
+        return cp.cuda.get_elapsed_time(event_before, event_after) / 1e3
+
+    def _get_event(self):
+        event = cp.cuda.Event()
+        event.record()
+        return event
-- 
GitLab