From a9f84005b032b07013e3ff89faf67e76014e5022 Mon Sep 17 00:00:00 2001
From: leufen1 <l.leufen@fz-juelich.de>
Date: Fri, 20 Aug 2021 15:55:55 +0200
Subject: [PATCH] added cleanup; /close #321 when the HPC system test works

---
 .../data_handler_single_station.py            |  9 ++++++
 mlair/data_handler/default_data_handler.py    |  3 +-
 mlair/helpers/helpers.py                      | 32 +++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/mlair/data_handler/data_handler_single_station.py b/mlair/data_handler/data_handler_single_station.py
index 3392e416..8c22b42f 100644
--- a/mlair/data_handler/data_handler_single_station.py
+++ b/mlair/data_handler/data_handler_single_station.py
@@ -5,6 +5,8 @@ __date__ = '2020-07-20'
 
 import copy
 import datetime as dt
+import gc
+
 import dill
 import hashlib
 import logging
@@ -107,6 +109,13 @@ class DataHandlerSingleStation(AbstractDataHandler):
 
         # create samples
         self.setup_samples()
+        self.clean_up()
+
+    def clean_up(self):
+        self._data = None
+        self.input_data = None
+        self.target_data = None
+        gc.collect()
 
     def __str__(self):
         return self.station[0]
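The clean_up hook added above releases a station's largest arrays once the
samples have been built. A minimal standalone sketch of the same pattern,
with a hypothetical StationData class standing in for DataHandlerSingleStation:

    import gc

    class StationData:
        """Toy stand-in for DataHandlerSingleStation (illustrative only)."""

        def __init__(self, raw):
            self._data = raw                # full raw series
            self.input_data = raw[:-1]      # derived model inputs
            self.target_data = raw[1:]      # derived targets
            self.clean_up()                 # drop the big arrays right away

        def clean_up(self):
            # Remove the references so the arrays become unreachable, then
            # force a collection instead of waiting for the next automatic
            # garbage-collection cycle.
            self._data = None
            self.input_data = None
            self.target_data = None
            gc.collect()

    handler = StationData(list(range(1_000_000)))  # memory is freed again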
diff --git a/mlair/data_handler/default_data_handler.py b/mlair/data_handler/default_data_handler.py
index a17de954..3157248f 100644
--- a/mlair/data_handler/default_data_handler.py
+++ b/mlair/data_handler/default_data_handler.py
@@ -307,8 +307,7 @@ class DefaultDataHandler(AbstractDataHandler):
         n_process = min([psutil.cpu_count(logical=False), len(set_stations), max_process])  # use only physical cpus
         if n_process > 1 and kwargs.get("use_multiprocessing", True) is True:  # parallel solution
             logging.info("use parallel transformation approach")
-            pool = multiprocessing.Pool(
-                min([psutil.cpu_count(logical=False), len(set_stations), 16]))  # use only physical cpus
+            pool = multiprocessing.Pool(n_process)  # n_process is capped at the physical cpu count
             logging.info(f"running {getattr(pool, '_processes')} processes in parallel")
             output = [
                 pool.apply_async(f_proc, args=(cls.data_handler_transformation, station), kwds=sp_keys)
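The hunk above drops a second, hard-coded min(...) computation and reuses the
n_process value already derived for the guard, so the pool is sized by the
same rule that decided to parallelize. A minimal sketch of this fan-out
pattern, assuming a picklable top-level worker; f_proc and the station IDs
are placeholders rather than MLAir's real signatures:

    import multiprocessing
    import psutil

    def f_proc(func, station):
        # Placeholder worker; must be defined at module level so that it
        # can be pickled and shipped to the worker processes.
        return station, func

    if __name__ == "__main__":
        set_stations = ["DEBW107", "DEBW013", "DEBW087"]  # illustrative IDs
        max_process = 16
        # cpu_count(logical=False) may return None; fall back to one worker.
        n_physical = psutil.cpu_count(logical=False) or 1
        n_process = min([n_physical, len(set_stations), max_process])
        pool = multiprocessing.Pool(n_process)
        output = [pool.apply_async(f_proc, args=("transformation", s))
                  for s in set_stations]
        results = [res.get() for res in output]
        pool.close()
        pool.join()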
diff --git a/mlair/helpers/helpers.py b/mlair/helpers/helpers.py
index 5ddaa3ee..4cc7310d 100644
--- a/mlair/helpers/helpers.py
+++ b/mlair/helpers/helpers.py
@@ -4,6 +4,7 @@ __date__ = '2019-10-21'
 
 import inspect
 import math
+import sys
 
 import numpy as np
 import xarray as xr
@@ -179,3 +180,34 @@ def convert2xrda(arr: Union[xr.DataArray, xr.Dataset, np.ndarray, int, float],
             kwargs.update({'dims': dims, 'coords': coords})
 
         return xr.DataArray(arr, **kwargs)
+
+
+# def convert_size(size_bytes):
+#     if size_bytes == 0:
+#         return "0B"
+#     size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+#     i = int(math.floor(math.log(size_bytes, 1024)))
+#     p = math.pow(1024, i)
+#     s = round(size_bytes / p, 2)
+#     return "%s %s" % (s, size_name[i])
+#
+#
+# def get_size(obj, seen=None):
+#     """Recursively finds size of objects"""
+#     size = sys.getsizeof(obj)
+#     if seen is None:
+#         seen = set()
+#     obj_id = id(obj)
+#     if obj_id in seen:
+#         return 0
+#     # Important: mark as seen *before* entering recursion to gracefully
+#     # handle self-referential objects
+#     seen.add(obj_id)
+#     if isinstance(obj, dict):
+#         size += sum([get_size(v, seen) for v in obj.values()])
+#         size += sum([get_size(k, seen) for k in obj.keys()])
+#     elif hasattr(obj, '__dict__'):
+#         size += get_size(obj.__dict__, seen)
+#     elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
+#         size += sum([get_size(i, seen) for i in obj])
+#     return size
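convert_size and get_size are kept commented out for now; together they are
a common recipe for estimating an object's deep memory footprint (the
import sys added above is what they would need). If they were uncommented,
a typical call site would look like this, with an illustrative sample dict:

    # Assumes the two helpers above are uncommented.
    data = {"inputs": list(range(10_000)), "targets": list(range(10_000))}
    n_bytes = get_size(data)      # deep size in bytes; `seen` guards cycles
    print(convert_size(n_bytes))  # human-readable string such as "123.45 KB"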
-- 
GitLab