From c2cd4a13f9cec7c8fcfe1c42282b1e5dbdef4a92 Mon Sep 17 00:00:00 2001 From: Damian Alvarez <swmanage@jwlogin01.juwels> Date: Fri, 4 Aug 2023 13:26:25 +0200 Subject: [PATCH] To get rid of the nvidia-driver headaches. But for older stages, let's preserve the module and structure. --- Custom_EasyBlocks/nvidia_driver.py | 100 +++++++++++------- .../n/nvidia-driver/nvidia-driver-default.eb | 7 +- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- .../n/nvidia-driver/nvidia-driver-default.eb | 27 ----- 8 files changed, 65 insertions(+), 204 deletions(-) delete mode 100644 Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb delete mode 100644 Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb delete mode 100644 Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb delete mode 100644 Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb delete mode 100644 Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb delete mode 100644 Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb diff --git a/Custom_EasyBlocks/nvidia_driver.py b/Custom_EasyBlocks/nvidia_driver.py index 48d74f485..a8b1933b1 100644 --- a/Custom_EasyBlocks/nvidia_driver.py +++ b/Custom_EasyBlocks/nvidia_driver.py @@ -32,6 +32,7 @@ class EB_nvidia_minus_driver(Binary): """Support for generic 'default' modules with specific real versions""" extra_vars = { 'realversion': [None, "Real version to be used when version = 'default'", CUSTOM], + 'just_GL_libs': [False, "Install just GL-related libs", CUSTOM], } return extra_vars @@ -68,33 +69,46 @@ class EB_nvidia_minus_driver(Binary): "Install NVIDIA libs simply by copying files. We can't use the installer because it requires root privileges." # list of libs - libs = expand_glob_paths([os.path.join(self.libsdir, 'lib*.so*')]) - try: - libs += expand_glob_paths([os.path.join(self.libsdir, '*.la')]) - except EasyBuildError: - self.log.info("No *.la files found. Proceeding without them.") - libs += [os.path.join(self.libsdir, 'nvidia_drv.so')] - - # list of binaries - binaries = ['nvidia-bug-report.sh', - 'nvidia-cuda-mps-control', - 'nvidia-cuda-mps-server', - 'nvidia-debugdump', - 'nvidia-settings', - 'nvidia-smi', - 'nvidia-xconfig'] - binaries = [os.path.join(self.libsdir, x) for x in binaries] - - # list of manpages - manpages = ['nvidia-settings.1.gz', - 'nvidia-cuda-mps-control.1.gz', - 'nvidia-xconfig.1.gz', - 'nvidia-smi.1.gz'] - manpages = [os.path.join(self.libsdir, x) for x in manpages] + if not self.cfg['just_GL_libs']: + libs = expand_glob_paths([os.path.join(self.libsdir, 'lib*.so*')]) + try: + libs += expand_glob_paths([os.path.join(self.libsdir, '*.la')]) + except EasyBuildError: + self.log.info("No *.la files found. Proceeding without them.") + libs += [os.path.join(self.libsdir, 'nvidia_drv.so')] + else: + libs = expand_glob_paths([os.path.join(self.libsdir, 'libEGL*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libGL*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-egl*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-gl*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-ptx*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-rtcore*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-tls*.so*')]) + libs += expand_glob_paths([os.path.join(self.libsdir, 'libnvidia-vulkan*.so*')]) + + + if not self.cfg['just_GL_libs']: + # list of binaries + binaries = ['nvidia-bug-report.sh', + 'nvidia-cuda-mps-control', + 'nvidia-cuda-mps-server', + 'nvidia-debugdump', + 'nvidia-settings', + 'nvidia-smi', + 'nvidia-xconfig'] + binaries = [os.path.join(self.libsdir, x) for x in binaries] + + # list of manpages + manpages = ['nvidia-settings.1.gz', + 'nvidia-cuda-mps-control.1.gz', + 'nvidia-xconfig.1.gz', + 'nvidia-smi.1.gz'] + manpages = [os.path.join(self.libsdir, x) for x in manpages] + + copy(binaries, os.path.join(self.installdir, 'bin')) + copy(manpages, os.path.join(self.installdir, 'man', 'man1')) copy(libs, os.path.join(self.installdir, 'lib64')) - copy(binaries, os.path.join(self.installdir, 'bin')) - copy(manpages, os.path.join(self.installdir, 'man', 'man1')) def post_install_step(self): """Generate the appropriate symlinks""" @@ -104,12 +118,13 @@ class EB_nvidia_minus_driver(Binary): # Run ldconfig to create missing symlinks (libcuda.so.1, etc) run_cmd("/usr/sbin/ldconfig -N %s" % libdir) - # Create an extra symlink for libcuda.so, otherwise PGI 19.X breaks - # Create an extra symlink for libnvidia-ml.so, otherwise MVAPICH2 doesn't find it if it doesn't rely on stubs - missing_links = ['libcuda.so', 'libnvidia-ml.so'] - for missing_link in missing_links: - run_cmd("ln -s %s/%s.1 %s/%s" % - (libdir, missing_link, libdir, missing_link)) + if not self.cfg['just_GL_libs']: + # Create an extra symlink for libcuda.so, otherwise PGI 19.X breaks + # Create an extra symlink for libnvidia-ml.so, otherwise MVAPICH2 doesn't find it if it doesn't rely on stubs + missing_links = ['libcuda.so', 'libnvidia-ml.so'] + for missing_link in missing_links: + run_cmd("ln -s %s/%s.1 %s/%s" % + (libdir, missing_link, libdir, missing_link)) super(EB_nvidia_minus_driver, self).post_install_step() @@ -120,13 +135,26 @@ class EB_nvidia_minus_driver(Binary): chk_libdir = ["lib64"] - nvlibs = ["cuda"] + if not self.cfg['just_GL_libs']: + nvlibs = ["cuda"] + binaries = [os.path.join("bin", x) for x in ["nvidia-smi"]] + libs = [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext) + for x in chk_libdir for y in nvlibs] + else: + nvlibs_0_suffix = ["EGL_nvidia", "GLX_nvidia"] + nvlibs_1_suffix = ["GLESv1_CM_nvidia"] + nvlibs_2_suffix = ["GLESv2_nvidia"] + binaries = [] + libs = [os.path.join("%s", "lib%s.%s.0") % (x, y, shlib_ext) + for x in chk_libdir for y in nvlibs_0_suffix] + libs += [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext) + for x in chk_libdir for y in nvlibs_1_suffix] + libs += [os.path.join("%s", "lib%s.%s.2") % (x, y, shlib_ext) + for x in chk_libdir for y in nvlibs_2_suffix] + custom_paths = { - 'files': [os.path.join("bin", x) for x in ["nvidia-smi"]] + - [os.path.join("%s", "lib%s.%s.1") % (x, y, shlib_ext) - for x in chk_libdir for y in nvlibs], + 'files': binaries + libs, 'dirs': [''], } - super(EB_nvidia_minus_driver, self).sanity_check_step( custom_paths=custom_paths) diff --git a/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb b/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb index 006b90db0..1a3c6bfc1 100644 --- a/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb +++ b/Golden_Repo/n/nvidia-driver/nvidia-driver-default.eb @@ -17,11 +17,6 @@ source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] checksums = ['0492ddc5b5e65aa00cbc762e8d6680205c8d08e103b7131087a15126aee495e9'] -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' +just_GL_libs = True moduleclass = 'system' diff --git a/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/hdfml_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' diff --git a/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/jureca_spr_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' diff --git a/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/jurecadc_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' diff --git a/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/jusuf_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' diff --git a/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/juwels_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' diff --git a/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb b/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb deleted file mode 100644 index cdf59cd27..000000000 --- a/Overlays/juwelsbooster_overlay/n/nvidia-driver/nvidia-driver-default.eb +++ /dev/null @@ -1,27 +0,0 @@ -name = 'nvidia-driver' -version = 'default' -realversion = '535.54.03' - -homepage = 'https://developer.nvidia.com/cuda-toolkit' -description = f""" -This is a set of libraries normally installed by the NVIDIA driver installer. - -The real version of this package is {realversion}. -""" - -site_contacts = 'sc@fz-juelich.de' - -toolchain = SYSTEM - -source_urls = ['http://us.download.nvidia.com/tesla/%s/' % realversion] -sources = ['NVIDIA-Linux-x86_64-%s.run' % realversion] -checksums = ['454764f57ea1b9e19166a370f78be10e71f0626438fb197f726dc3caf05b4082'] - -# To avoid conflicts between NVML and the kernel driver -postinstallcmds = ['rm %(installdir)s/lib64/libnvidia-ml.so*'] - -modluafooter = ''' -add_property("arch","gpu") -''' - -moduleclass = 'system' -- GitLab