From c2cd4a13f9cec7c8fcfe1c42282b1e5dbdef4a92 Mon Sep 17 00:00:00 2001
From: Damian Alvarez <swmanage@jwlogin01.juwels>
Date: Fri, 4 Aug 2023 13:26:25 +0200
Subject: [PATCH] nvidia-driver: install only the GL-related libs to get rid of
 the driver headaches; keep the module and structure for older stages.

---
 Custom_EasyBlocks/nvidia_driver.py            | 100 +++++++++++-------
 .../n/nvidia-driver/nvidia-driver-default.eb  |   7 +-
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 .../n/nvidia-driver/nvidia-driver-default.eb  |  27 -----
 8 files changed, 65 insertions(+), 204 deletions(-)
 delete mode 100644 Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb
 delete mode 100644 Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb
 delete mode 100644 Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb
 delete mode 100644 Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb
 delete mode 100644 Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb
 delete mode 100644 Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb

diff --git a/Custom_EasyBlocks/nvidia_driver.py b/Custom_EasyBlocks/nvidia_driver.py
index 48d74f485..a8b1933b1 100644
--- a/Custom_EasyBlocks/nvidia_driver.py
+++ b/Custom_EasyBlocks/nvidia_driver.py
@@ -32,6 +32,7 @@ class EB_nvidia_minus_driver(Binary):
         """Support for generic 'default' modules with specific real versions"""
         extra_vars = {
             'realversion': [None, "Real version to be used when version = 'default'", CUSTOM],
+            'just_GL_libs': [False, "Install just GL-related libs", CUSTOM],
         }
         return extra_vars
 
@@ -68,33 +69,46 @@ class EB_nvidia_minus_driver(Binary):
         "Install NVIDIA libs simply by copying files. We can't use the installer because it requires root privileges."
 
         # list of libs
-        libs = expand_glob_paths([os.path.join(self.libsdir, 'lib*.so*')])
-        try:
-            libs += expand_glob_paths([os.path.join(self.libsdir, '*.la')])
-        except EasyBuildError:
-            self.log.info("No *.la files found. Proceeding without them.")
-        libs += [os.path.join(self.libsdir, 'nvidia_drv.so')]
-
-        # list of binaries
-        binaries = ['nvidia-bug-report.sh',
-                    'nvidia-cuda-mps-control',
-                    'nvidia-cuda-mps-server',
-                    'nvidia-debugdump',
-                    'nvidia-settings',
-                    'nvidia-smi',
-                    'nvidia-xconfig']
-        binaries = [os.path.join(self.libsdir, x) for x in binaries]
-
-        # list of manpages
-        manpages = ['nvidia-settings.1.gz',
-                    'nvidia-cuda-mps-control.1.gz',
-                    'nvidia-xconfig.1.gz',
-                    'nvidia-smi.1.gz']
-        manpages = [os.path.join(self.libsdir, x) for x in manpages]
+        if not self.cfg['just_GL_libs']:
+            libs = expand_glob_paths([os.path.join(self.libsdir, 'lib*.so*')])
+            try:
+                libs += expand_glob_paths([os.path.join(self.libsdir, '*.la')])
+            except EasyBuildError:
+                self.log.info("No *.la files found. Proceeding without them.")
+            libs += [os.path.join(self.libsdir, 'nvidia_drv.so')]
+        else:
+            libs = expand_glob_paths([os.path.join(self.libsdir, 'libEGL*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libGL*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-egl*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-gl*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-ptx*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-rtcore*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-tls*.so*')])
+            libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-vulkan*.so*')])
+
+
+        if not self.cfg['just_GL_libs']:
+            # list of binaries
+            binaries = ['nvidia-bug-report.sh',
+                        'nvidia-cuda-mps-control',
+                        'nvidia-cuda-mps-server',
+                        'nvidia-debugdump',
+                        'nvidia-settings',
+                        'nvidia-smi',
+                        'nvidia-xconfig']
+            binaries = [os.path.join(self.libsdir, x) for x in binaries]
+
+            # list of manpages
+            manpages = ['nvidia-settings.1.gz',
+                        'nvidia-cuda-mps-control.1.gz',
+                        'nvidia-xconfig.1.gz',
+                        'nvidia-smi.1.gz']
+            manpages = [os.path.join(self.libsdir, x) for x in manpages]
+
+            copy(binaries, os.path.join(self.installdir, 'bin'))
+            copy(manpages, os.path.join(self.installdir, 'man', 'man1'))
 
         copy(libs, os.path.join(self.installdir, 'lib64'))
-        copy(binaries, os.path.join(self.installdir, 'bin'))
-        copy(manpages, os.path.join(self.installdir, 'man', 'man1'))
 
     def post_install_step(self):
         """Generate the appropriate symlinks"""
@@ -104,12 +118,13 @@ class EB_nvidia_minus_driver(Binary):
         # Run ldconfig to create missing symlinks (libcuda.so.1, etc)
         run_cmd("/usr/sbin/ldconfig -N %s" % libdir)
 
-        # Create an extra symlink for libcuda.so, otherwise PGI 19.X breaks
-        # Create an extra symlink for libnvidia-ml.so, otherwise MVAPICH2 doesn't find it if it doesn't rely on stubs
-        missing_links = ['libcuda.so', 'libnvidia-ml.so']
-        for missing_link in missing_links:
-            run_cmd("ln -s %s/%s.1 %s/%s" %
-                    (libdir, missing_link, libdir, missing_link))
+        if not self.cfg['just_GL_libs']:
+            # Create an extra symlink for libcuda.so, otherwise PGI 19.X breaks
+            # Create an extra symlink for libnvidia-ml.so, otherwise MVAPICH2 doesn't find it if it doesn't rely on stubs
+            missing_links = ['libcuda.so', 'libnvidia-ml.so']
+            for missing_link in missing_links:
+                run_cmd("ln -s %s/%s.1 %s/%s" %
+                        (libdir, missing_link, libdir, missing_link))
 
         super(EB_nvidia_minus_driver, self).post_install_step()
 
@@ -120,13 +135,26 @@ class EB_nvidia_minus_driver(Binary):
 
         chk_libdir = ["lib64"]
 
-        nvlibs = ["cuda"]
+        if not self.cfg['just_GL_libs']:
+            nvlibs = ["cuda"]
+            binaries = [os.path.join("bin", x) for x in ["nvidia-smi"]]
+            libs = [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext)
+                    for x in chk_libdir for y in nvlibs]
+        else:
+            nvlibs_0_suffix = ["EGL_nvidia", "GLX_nvidia"]
+            nvlibs_1_suffix = ["GLESv1_CM_nvidia"]
+            nvlibs_2_suffix = ["GLESv2_nvidia"]
+            binaries = []
+            libs = [os.path.join("%s", "lib%s.%s.0") % (x, y, shlib_ext)
+                    for x in chk_libdir for y in nvlibs_0_suffix]
+            libs += [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext)
+                     for x in chk_libdir for y in nvlibs_1_suffix]
+            libs += [os.path.join("%s", "lib%s.%s.2") % (x, y, shlib_ext)
+                     for x in chk_libdir for y in nvlibs_2_suffix]
+
         custom_paths = {
-            'files': [os.path.join("bin", x) for x in ["nvidia-smi"]] +
-            [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext)
-             for x in chk_libdir for y in nvlibs],
+            'files': binaries + libs,
             'dirs': [''],
         }
-
         super(EB_nvidia_minus_driver, self).sanity_check_step(
             custom_paths=custom_paths)
diff --git a/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb b/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb
index 006b90db0..1a3c6bfc1 100644
--- a/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb
+++ b/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb
@@ -17,11 +17,6 @@ source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
 sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
 checksums = ['0492ddc5b5e65aa00cbc762e8d6680205c8d08e103b7131087a15126aee495e9']
 
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
+just_GL_libs = True
 
 moduleclass = 'system'
diff --git a/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
diff --git a/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
diff --git a/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
diff --git a/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
diff --git a/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
diff --git a/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb
deleted file mode 100644
index cdf59cd27..000000000
--- a/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb
+++ /dev/null
@@ -1,27 +0,0 @@
-name = 'nvidia-driver'
-version = 'default'
-realversion = '535.54.03'
-
-homepage = 'https://developer.nvidia.com/cuda-toolkit'
-description = f"""
-This is a set of libraries normally installed by the NVIDIA driver installer.
-
-The real version of this package is {realversion}.
-"""
-
-site_contacts = 'sc@fz-juelich.de'
-
-toolchain = SYSTEM
-
-source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion]
-sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion]
-checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082']
-
-# To avoid conflicts between NVML and the kernel driver
-postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*']
-
-modluafooter = '''
-add_property("arch","gpu")
-'''
-
-moduleclass = 'system'
-- 
GitLab