diff --git a/Golden_Repo/m/matplotlib/matplotlib-3.4.3-gcccoremkl-11.2.0-2021.4.0.eb b/Golden_Repo/m/matplotlib/matplotlib-3.4.3-gcccoremkl-11.2.0-2021.4.0.eb index c510d60b69f5bf6fe78430aa186a7a6e1cbfb7ae..acbb9fa3632437fb9848d8f0637011c494881209 100644 --- a/Golden_Repo/m/matplotlib/matplotlib-3.4.3-gcccoremkl-11.2.0-2021.4.0.eb +++ b/Golden_Repo/m/matplotlib/matplotlib-3.4.3-gcccoremkl-11.2.0-2021.4.0.eb @@ -21,7 +21,7 @@ dependencies = [ ('libpng', '1.6.37'), ('freetype', '2.11.0'), ('Tkinter', '%(pyver)s'), - ('Pillow-SIMD', '8.3.1'), + ('Pillow-SIMD', '9.0.1'), ('Qhull', '2020.2') ] diff --git a/Golden_Repo/p/Pillow-SIMD/Pillow-SIMD-9.0.1-GCCcore-11.2.0.eb b/Golden_Repo/p/Pillow-SIMD/Pillow-SIMD-9.0.1-GCCcore-11.2.0.eb new file mode 100644 index 0000000000000000000000000000000000000000..956920d44ff90bc2b356073d3e69e4be91038d3d --- /dev/null +++ b/Golden_Repo/p/Pillow-SIMD/Pillow-SIMD-9.0.1-GCCcore-11.2.0.eb @@ -0,0 +1,38 @@ +easyblock = 'PythonPackage' + +name = 'Pillow-SIMD' +version = '9.0.1' + +homepage = 'https://github.com/uploadcare/pillow-simd' +description = """Pillow is the 'friendly PIL fork' by Alex Clark and Contributors. + PIL is the Python Imaging Library by Fredrik Lundh and Contributors.""" + +toolchain = {'name': 'GCCcore', 'version': '11.2.0'} + +source_urls = ['https://github.com/uploadcare/pillow-simd/archive/'] +sources = ['%(version)s.tar.gz'] + +builddependencies = [('binutils', '2.37')] + +dependencies = [ + ('Python', '3.9.6'), + ('libjpeg-turbo', '2.1.1'), + ('libpng', '1.6.37'), + ('zlib', '1.2.11'), + ('LibTIFF', '4.3.0'), + ('freetype', '2.11.0') +] + +use_pip = True +download_dep_fail = True + +options = {'modulename': 'PIL'} + +sanity_check_paths = { + 'files': [], + 'dirs': ['lib/python%(pyshortver)s/site-packages/PIL'], +} + +sanity_pip_check = True + +moduleclass = 'vis' diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch b/Golden_Repo/p/PyTorch/PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch new file mode 100644 index 0000000000000000000000000000000000000000..fb66d1b66968f1dc54a1b7d3e352f2034a4f14a2 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch @@ -0,0 +1,40 @@ +# Author: Alexander Grund +# Avoid test failures in CGROUP environments +# See https://github.com/pytorch/pytorch/issues/44368 and https://github.com/pytorch/pytorch/pull/44369 +diff -Nru pytorch.orig/test/test_dataloader.py pytorch/test/test_dataloader.py +--- pytorch.orig/test/test_dataloader.py 2021-10-28 19:19:23.284526686 +0200 ++++ pytorch/test/test_dataloader.py 2021-10-28 19:21:31.860488973 +0200 +@@ -2374,22 +2374,27 @@ + after = os.sched_getaffinity(0) + return iter(after) + +- +-def worker_set_affinity(_): +- os.sched_setaffinity(0, [multiprocessing.cpu_count() - 1]) +- +- + @unittest.skipIf( + not hasattr(os, 'sched_setaffinity'), + "os.sched_setaffinity is not available") + class TestSetAffinity(TestCase): + def test_set_affinity_in_worker_init(self): ++ # Query the current affinity mask to avoid setting a disallowed one ++ old_affinity = os.sched_getaffinity(0) ++ if not old_affinity: ++ self.skipTest("No affinity information") ++ # Choose any ++ expected_affinity = list(old_affinity)[-1] ++ ++ def worker_set_affinity(_): ++ os.sched_setaffinity(0, [expected_affinity]) ++ + dataset = SetAffinityDataset() + + dataloader = torch.utils.data.DataLoader( + dataset, num_workers=2, worker_init_fn=worker_set_affinity) + for sample in dataloader: +- self.assertEqual(sample, [multiprocessing.cpu_count() - 1]) ++ self.assertEqual(sample, [expected_affinity]) + + class ConvDataset(Dataset): + def __init__(self): diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.10.0_skip_cmake_rpath.patch b/Golden_Repo/p/PyTorch/PyTorch-1.10.0_skip_cmake_rpath.patch new file mode 100644 index 0000000000000000000000000000000000000000..09469cd45b4f59fc0a1a49e9477c97ed214cb866 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.10.0_skip_cmake_rpath.patch @@ -0,0 +1,195 @@ +# Author: Caspar van Leeuwen +# PyTorch's CMAKE configuration by default sets RUNPATH on libraries if they link other libraries +# that are outside the build tree, which is done because of the CMAKE config on +# https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L10. +# This provides problems, since the cuda stubs library path then also gets added to the RUNPATH. +# As a result, at runtime, the stub version of things like libcuda.so.1 gets picked up, instead of the real drivers +# See https://github.com/easybuilders/easybuild-easyconfigs/issues/14359 +# This line https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L16 +# Makes sure that any path that is linked, is also added to the RUNPATH. +# This has been reported upstream in https://github.com/pytorch/pytorch/issues/35418 +# and a fix was attempted in https://github.com/pytorch/pytorch/pull/37737 but it was reverted +# +# This EasyBuild patch changes behavior for the libraries that were failing, i.e. the ones in this list: +# https://github.com/easybuilders/easybuild-easyconfigs/issues/14359#issuecomment-970479904 +# This is done by setting INSTALL_RPATH_USE_LINK_PATH to false, and instead, specifying the RPATH +# explicitely by defining INSTALL_RPATH, but only adding directories that do not match to the "stubs" regex +# It has been upstreamed in this PR https://github.com/pytorch/pytorch/pull/68912 (not accepted yet at the time of writing) +diff -Nru pytorch.orig/caffe2/CMakeLists.txt pytorch/caffe2/CMakeLists.txt +--- pytorch.orig/caffe2/CMakeLists.txt 2021-11-17 11:46:01.797337624 +0100 ++++ pytorch/caffe2/CMakeLists.txt 2021-11-18 19:05:35.637707235 +0100 +@@ -630,8 +630,33 @@ + else() + set(DELAY_LOAD_FLAGS "") + endif() +- target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS}) ++ # message("CUDA_NVRTC: ${CUDA_NVRTC}") ++ # message("CUDA_NVRTC_LIB: ${CUDA_NVRTC_LIB}") ++ # message("CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}") ++ # message("DELAY_LOAD_FLAGS: ${DELAY_LOAD_FLAGS}") ++ # if(CUDA_CUDA_LIB MATCHES "stubs") ++ # message("stubs libraries found in the CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}") ++ # else() ++ # message("Stubs libs not found in CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}") ++ # endif() ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of CAFFE2_NVRTC: ++ set(CAFFE2_NVRTC_LIBS ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB}) ++ foreach(LIB IN LISTS CAFFE2_NVRTC_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND CAFFE2_NVRTC_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("CAFFE2_NVRTC_RPATH: ${CAFFE2_NVRTC_RPATH}") ++ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH "${CAFFE2_NVRTC_RPATH}") ++ target_link_libraries(caffe2_nvrtc ${CAFFE2_NVRTC_LIBS} ${DELAY_LOAD_FLAGS}) + target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS}) ++# message(FATAL_ERROR "STOP HERE, we're debugging") + install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}") + if(USE_NCCL AND BUILD_SPLIT_CUDA) + list(APPEND Caffe2_GPU_SRCS_CPP +diff -Nru pytorch.orig/test/cpp/api/CMakeLists.txt pytorch/test/cpp/api/CMakeLists.txt +--- pytorch.orig/test/cpp/api/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100 ++++ pytorch/test/cpp/api/CMakeLists.txt 2021-11-18 19:06:41.207423777 +0100 +@@ -61,6 +61,22 @@ + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_api: ++ set(TEST_API_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) ++ foreach(LIB IN LISTS TEST_API_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND TEST_API_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("TEST_API_RPATH: ${TEST_API_RPATH}") ++ set_target_properties(test_api PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(test_api PROPERTIES INSTALL_RPATH "${TEST_API_RPATH}") ++ + target_compile_definitions(test_api PRIVATE "USE_CUDA") + endif() + +diff -Nru pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt pytorch/test/cpp/dist_autograd/CMakeLists.txt +--- pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100 ++++ pytorch/test/cpp/dist_autograd/CMakeLists.txt 2021-11-18 19:06:18.389174421 +0100 +@@ -16,6 +16,22 @@ + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_dist_autograd: ++ set(DIST_AUTOGRAD_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) ++ foreach(LIB IN LISTS DIST_AUTOGRAD_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND DIST_AUTOGRAD_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("DIST_AUTOGRAD_RPATH: ${DIST_AUTOGRAD_RPATH}") ++ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH "${DIST_AUTOGRAD_RPATH}") ++ + target_compile_definitions(test_dist_autograd PRIVATE "USE_CUDA") + endif() + +diff -Nru pytorch.orig/test/cpp/jit/CMakeLists.txt pytorch/test/cpp/jit/CMakeLists.txt +--- pytorch.orig/test/cpp/jit/CMakeLists.txt 2021-11-17 11:46:02.989350630 +0100 ++++ pytorch/test/cpp/jit/CMakeLists.txt 2021-11-18 19:05:41.396770168 +0100 +@@ -94,6 +94,7 @@ + list(APPEND JIT_TEST_DEPENDENCIES onnx_library) + endif(MSVC) + ++ + target_link_libraries(test_jit PRIVATE ${JIT_TEST_DEPENDENCIES}) + target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE}) + +@@ -109,6 +110,22 @@ + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_jit: ++ set(TEST_JIT_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) ++ foreach(LIB IN LISTS TEST_JIT_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in test_jit's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND TEST_JIT_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("TEST_JIT_RPATH: ${TEST_JIT_RPATH}") ++ set_target_properties(test_jit PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(test_jit PROPERTIES INSTALL_RPATH "${TEST_JIT_RPATH}") ++ + target_compile_definitions(test_jit PRIVATE USE_CUDA) + elseif(USE_ROCM) + target_link_libraries(test_jit PRIVATE +diff -Nru pytorch.orig/test/cpp/rpc/CMakeLists.txt pytorch/test/cpp/rpc/CMakeLists.txt +--- pytorch.orig/test/cpp/rpc/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100 ++++ pytorch/test/cpp/rpc/CMakeLists.txt 2021-11-18 19:06:30.502306793 +0100 +@@ -39,6 +39,22 @@ + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_cpp_rpc: ++ set(CPP_RPC_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) ++ foreach(LIB IN LISTS CPP_RPC_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND CPP_RPC_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("CPP_RPC_RPATH: ${CPP_RPC_RPATH}") ++ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH "${CPP_RPC_RPATH}") ++ + target_compile_definitions(test_cpp_rpc PRIVATE "USE_CUDA") + endif() + +diff -Nru pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt pytorch/test/cpp/tensorexpr/CMakeLists.txt +--- pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100 ++++ pytorch/test/cpp/tensorexpr/CMakeLists.txt 2021-11-18 19:06:00.988984273 +0100 +@@ -62,6 +62,24 @@ + ${CUDA_CUDA_LIB} + ${TORCH_CUDA_LIBRARIES}) + target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA) ++ ++ # Make sure the CUDA stubs folder doesn't end up in the RPATH of tutorial_tensorexpr: ++ set(CUDA_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES}) ++ foreach(LIB IN LISTS CUDA_LINK_LIBS) ++ message("LIB: ${LIB}") ++ if(LIB MATCHES "stubs") ++ message("Filtering ${LIB} from being set in test_tensorexpr and tutorial_tensorexpr RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.") ++ else() ++ cmake_path(GET LIB PARENT_PATH LIB_PATH) ++ message("LIBPATH: ${LIB_PATH}") ++ list(APPEND TENSOREXPR_RPATH ${LIB_PATH}) ++ endif() ++ endforeach() ++ message("TENSOREXPR_RPATH: ${TENSOREXPR_RPATH}") ++ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}") ++ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE) ++ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}") + elseif(USE_ROCM) + target_link_libraries(test_tensorexpr PRIVATE + ${ROCM_HIPRTC_LIB} diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb new file mode 100644 index 0000000000000000000000000000000000000000..14935388ac98d2a0e241026e4eaf75a539c2605c --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb @@ -0,0 +1,183 @@ +name = 'PyTorch' +version = '1.11' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'gcccoremkl', 'version': '11.2.0-2021.4.0'} +toolchainopts = {'openmp': True} +# toolchainopts = {'cstd': 'c++11'} + +sources = [{ + 'filename': '%(name)s-%(version)s.tar.gz', + 'git_config': { + 'url': 'https://github.com/pytorch', + 'repo_name': 'pytorch', + 'tag': 'v1.11.0', + 'recursive': True, + }, +}] +patches = [ + 'PyTorch-1.7.0_avoid-nan-in-test-torch.patch', + 'PyTorch-1.7.0_disable-dev-shm-test.patch', + # 'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch', + 'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch', + # 'PyTorch-1.8.1_increase-distributed-test-timeout.patch', + 'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch', + 'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch', + # 'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch', + # 'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch', + # 'PyTorch-1.10.0_fix-test-cond-cpu.patch', + # 'PyTorch-1.10.0_fix-vnni-detection.patch', + # 'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch', + # 'PyTorch-1.10.0_skip_failing_ops_tests.patch', + # 'PyTorch-1.10.0_skip_nan_tests_openblas.patch', + 'PyTorch-1.10.0_skip_cmake_rpath.patch', + # 'PyTorch-1.10.0_fix-gcc11-ideep.patch', + # 'PyTorch-1.10.0_fix_gcc11_nullpointer.patch', + # 'cub-lint.yaml.patch', + # 'cub-cub.cuh.patch', + #('cub-cub-definitions.patch', 1), + # 'cub-context_gpu.patch', + # 'cub-accuracy_op.patch', + # 'cub-affine-channel_op.patch', + # 'cub-arg_ops.patch', + # 'cub-batch_moments_op.patch', + # 'cub-batch_sparse_to_dense_op.patch', + # 'cub-boolean_mask_ops.patch', + # 'cub-cross_entropy.patch', + # 'cub-distance_op.patch', + # 'cub-elementwise_div_op.patch', + # 'cub-elementwise_linear_op.patch', + # 'cub-elementwise_mul_op.patch', + # 'cub-elementwise_ops.patch', + # 'cub-find_op.patch', + # 'cub-generate_proposals_op.patch', + # 'cub-normalize_ops.patch', + # 'cub-one_hot_ops.patch', + # 'cub-pack_segments.patch', + # 'cub-prelu_op.patch', + # 'cub-reduce_front_back_max_ops.patch', + # 'cub-reduce_front_back_sum_mean_ops.patch', + # 'cub-reduction_ops.patch', + # 'cub-rmac_regions_op.patch', + # 'cub-segment_reduction_op_gpu.patch', + # 'cub-sequence_ops.patch', + # 'cub-softmax_ops.patch', + # 'cub-spatial_batch_norm_op_impl.patch', + # 'cub-adagrad_fused_op_gpu.patch', + # 'cub-adagrad_op_gpu.patch', + # 'cub-adam_op_gpu.patch', + #('cub-cub_namespace.patch', 1), + # 'cub-reduce.patch', + # 'cub-math-gpu.patch', + # 'cub-CMake-Dependencies.patch', + + +] + +osdependencies = [OS_PKG_IBVERBS_DEV] + +builddependencies = [ + ('CMake', '3.21.1'), + ('hypothesis', '6.14.6'), +] + +dependencies = [ + ('CUDA', '11.5', '', True), + ('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions + ('Python', '3.9.6'), + ('protobuf', '3.17.3'), + ('protobuf-python', '3.17.3'), + ('pybind11', '2.7.1'), + ('SciPy-bundle', '2021.10'), + ('typing-extensions', '3.10.0.0'), + ('PyYAML', '5.4.1'), + ('MPFR', '4.1.0'), + ('GMP', '6.2.1'), + ('numactl', '2.0.14', '', SYSTEM), + ('FFmpeg', '4.4.1'), + ('Pillow-SIMD', '9.0.1'), + ('cuDNN', '8.3.1.22', '-CUDA-%(cudaver)s', True), + ('magma', '2.6.1', '-CUDA-%(cudaver)s'), + ('NCCL', '2.11.4', '-CUDA-%(cudaver)s'), + ('expecttest', '0.1.3'), +] + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6'] + +custom_opts = ["USE_CUPTI_SO=1"] +configopts = 'MKL_THREADING_LAYER=sequential CFLAGS="$CFLAGS -fopenmp" CXXFLAGS="$CXXFLAGS -fopenmp" LDFLAGS=-fopenmp' + +excluded_tests = { + '': [ + # Bad tests: https://github.com/pytorch/pytorch/issues/60260 + 'distributed/elastic/utils/distributed_test', + 'distributed/elastic/multiprocessing/api_test', + # These tests fail on A10s at the very least, they time out forever no matter how long the timeout is. + # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html + # 'distributed/test_distributed_fork', + 'distributed/test_distributed_spawn', + # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079 + 'test_optim', + # Test from this suite timeout often. The process group backend is deprecated anyway + # 'distributed/rpc/test_process_group_agent', + 'test_jit', + 'test_jit_cuda_fuser', + 'test_jit_legacy', + 'test_jit_profiling', + 'test_xnnpack_integration', + 'distributed/_shard/sharded_optim/test_sharded_optim', + 'distributed/_shard/sharded_tensor/ops/test_linear', + 'distributed/_shard/sharded_tensor/test_megatron_prototype', + 'distributions/test_distributions', + 'test_cpp_extensions_jit', + 'distributed/rpc/test_tensorpipe_agent', + 'test_ops', + 'distributed/fsdp/test_fsdp_memory', # fails on hdfml + 'distributed/fsdp/test_fsdp_overlap', # fails on hdfml + 'test_autograd', # fails on jureca dc and deep + 'test_cuda', # fails on jureca dc + 'test_multiprocessing', # fails on jureca dc + 'test_nn', # fails on jureca dc + 'test_profiler', # fails on jureca dc + 'test_quantization', # fails on jureca dc + 'distributed/_shard/sharded_tensor/test_sharded_tensor', # fails on deep + 'distributed/algorithms/test_join', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_checkpoint', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_core', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_freezing_weights', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_memory', # fails on deep + 'distributed/fsdp/test_fsdp_multiple_forward', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_multiple_wrapping', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_overlap', # fails on deep + 'distributed/fsdp/test_fsdp_pure_fp16', # fails on deep and jureca dc + 'distributed/fsdp/test_fsdp_uneven', # fails on deep and jureca dc + 'distributed/fsdp/test_wrap', # fails on deep and jureca dc + 'distributed/optim/test_zero_redundancy_optimizer', # fails on deep and jureca dc + 'distributed/rpc/cuda/test_tensorpipe_agent', # fails on deep + 'distributed/rpc/test_faulty_agent', # fails on deep + 'distributed/test_c10d_gloo', # fails on deep + 'test_model_dump', # fails on deep + 'distributed/test_c10d_nccl', # fails on jureca dc + 'distributed/test_c10d_spawn_nccl', # fails on jureca dc + 'distributed/test_data_parallel', # fails on jureca dc + + ] +} + +runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' + +# The readelf sanity check can be taken out once the TestRPATH test from https://github.com/pytorch/pytorch/pull/68912 +# is accepted, since it is then checked as part of the PyTorch test suite +local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT +sanity_check_commands = [ + "python -c 'import torch'", + "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, +] +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'devel' diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_avoid-nan-in-test-torch.patch b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_avoid-nan-in-test-torch.patch new file mode 100644 index 0000000000000000000000000000000000000000..a14c146be550ee16e52e7c02400e55b358c058ae --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_avoid-nan-in-test-torch.patch @@ -0,0 +1,18 @@ +This test uses in-place operations which may generate NaNs making subsequent tests fail +See https://github.com/pytorch/pytorch/issues/48591 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_torch.py b/test/test_torch.py +index 1f3f568f7b..237fb030f6 100644 +--- a/test/test_torch.py ++++ b/test/test_torch.py +@@ -15060,7 +15060,7 @@ class TestTorchDeviceType(TestCase): + x_c = x.contiguous() + y_c = y.contiguous() + result_c = fn(x_c, y_c) +- result = fn(x, y) ++ result = fn(x.clone(), y) + self.assertEqual(result, result_c) + self.assertTrue( + result.is_contiguous(memory_format=memory_format), diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_disable-dev-shm-test.patch b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_disable-dev-shm-test.patch new file mode 100644 index 0000000000000000000000000000000000000000..0f85648a929d352009726aa01f2de9efcaef8378 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_disable-dev-shm-test.patch @@ -0,0 +1,21 @@ +This test fails randomly. I assume some concurrent test runners or another race condition +Should be safe even if this fails +See https://github.com/pytorch/pytorch/issues/48579 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py +index 49e0a3cb45..e4e5b64ca1 100644 +--- a/test/test_multiprocessing.py ++++ b/test/test_multiprocessing.py +@@ -202,7 +202,9 @@ class leak_checker(object): + # available_fds = self._get_next_fds(10) + # self.test_case.assertLessEqual( + # available_fds[-1] - self.next_fds[-1], 5) +- self.test_case.assertFalse(self.has_shm_files()) ++ # self.test_case.assertFalse(self.has_shm_files()) ++ if self.has_shm_files(): ++ print("WARNING: has_shm_files test would have failed!") + return False + + def check_pid(self, pid): diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch b/Golden_Repo/p/PyTorch/PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch new file mode 100644 index 0000000000000000000000000000000000000000..a5c0731da64c062085178f344d2212ec4922339f --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch @@ -0,0 +1,22 @@ +Disable a part of a test which uses the current GPUs CUDA compute capability +This will fail if the GPU is newer than what nvcc supports. +See https://github.com/pytorch/pytorch/issues/51950 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py +index efda7cb2cf..64607346c8 100644 +--- a/test/test_cpp_extensions_jit.py ++++ b/test/test_cpp_extensions_jit.py +@@ -181,11 +181,9 @@ class TestCppExtensionJIT(common.TestCase): + # - With/without '+PTX' + + n = torch.cuda.device_count() +- capabilities = {torch.cuda.get_device_capability(i) for i in range(n)} + # expected values is length-2 tuple: (list of ELF, list of PTX) + # note: there should not be more than one PTX value + archflags = { +- '': (['{}{}'.format(capability[0], capability[1]) for capability in capabilities], None), + "Maxwell+Tegra;6.1": (['53', '61'], None), + "Pascal 3.5": (['35', '60', '61'], None), + "Volta": (['70'], ['70']), diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch b/Golden_Repo/p/PyTorch/PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch new file mode 100644 index 0000000000000000000000000000000000000000..c360a459f01d189049e71c7c7a2dfc19de2bab8a --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch @@ -0,0 +1,20 @@ +Some tests fail when run with anything but 2 GPUs and others when run with anything but 2 or 4 GPUs. +So limit to 2 GPUs. + +See https://github.com/pytorch/pytorch/issues/59548 + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py +index 06f1b4f484..bc82f6c304 100644 +--- a/test/distributed/optim/test_zero_redundancy_optimizer.py ++++ b/test/distributed/optim/test_zero_redundancy_optimizer.py +@@ -233,7 +233,7 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): + class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): + @property + def world_size(self): +- return min(4, max(2, torch.cuda.device_count())) ++ return 2 + + @skip_if_rocm + def test_step(self):