Skip to content
Snippets Groups Projects
Commit f61a6569 authored by Alexandre Strube's avatar Alexandre Strube
Browse files

PyTorch and Pillow-SIMD and matplot with new pillow-simd

parent 86bc06c1
Branches
Tags
No related merge requests found
Showing with 538 additions and 1 deletion
...@@ -21,7 +21,7 @@ dependencies = [ ...@@ -21,7 +21,7 @@ dependencies = [
('libpng', '1.6.37'), ('libpng', '1.6.37'),
('freetype', '2.11.0'), ('freetype', '2.11.0'),
('Tkinter', '%(pyver)s'), ('Tkinter', '%(pyver)s'),
('Pillow-SIMD', '8.3.1'), ('Pillow-SIMD', '9.0.1'),
('Qhull', '2020.2') ('Qhull', '2020.2')
] ]
......
easyblock = 'PythonPackage'
name = 'Pillow-SIMD'
version = '9.0.1'
homepage = 'https://github.com/uploadcare/pillow-simd'
description = """Pillow is the 'friendly PIL fork' by Alex Clark and Contributors.
PIL is the Python Imaging Library by Fredrik Lundh and Contributors."""
toolchain = {'name': 'GCCcore', 'version': '11.2.0'}
source_urls = ['https://github.com/uploadcare/pillow-simd/archive/']
sources = ['%(version)s.tar.gz']
builddependencies = [('binutils', '2.37')]
dependencies = [
('Python', '3.9.6'),
('libjpeg-turbo', '2.1.1'),
('libpng', '1.6.37'),
('zlib', '1.2.11'),
('LibTIFF', '4.3.0'),
('freetype', '2.11.0')
]
use_pip = True
download_dep_fail = True
options = {'modulename': 'PIL'}
sanity_check_paths = {
'files': [],
'dirs': ['lib/python%(pyshortver)s/site-packages/PIL'],
}
sanity_pip_check = True
moduleclass = 'vis'
# Author: Alexander Grund
# Avoid test failures in CGROUP environments
# See https://github.com/pytorch/pytorch/issues/44368 and https://github.com/pytorch/pytorch/pull/44369
diff -Nru pytorch.orig/test/test_dataloader.py pytorch/test/test_dataloader.py
--- pytorch.orig/test/test_dataloader.py 2021-10-28 19:19:23.284526686 +0200
+++ pytorch/test/test_dataloader.py 2021-10-28 19:21:31.860488973 +0200
@@ -2374,22 +2374,27 @@
after = os.sched_getaffinity(0)
return iter(after)
-
-def worker_set_affinity(_):
- os.sched_setaffinity(0, [multiprocessing.cpu_count() - 1])
-
-
@unittest.skipIf(
not hasattr(os, 'sched_setaffinity'),
"os.sched_setaffinity is not available")
class TestSetAffinity(TestCase):
def test_set_affinity_in_worker_init(self):
+ # Query the current affinity mask to avoid setting a disallowed one
+ old_affinity = os.sched_getaffinity(0)
+ if not old_affinity:
+ self.skipTest("No affinity information")
+ # Choose any
+ expected_affinity = list(old_affinity)[-1]
+
+ def worker_set_affinity(_):
+ os.sched_setaffinity(0, [expected_affinity])
+
dataset = SetAffinityDataset()
dataloader = torch.utils.data.DataLoader(
dataset, num_workers=2, worker_init_fn=worker_set_affinity)
for sample in dataloader:
- self.assertEqual(sample, [multiprocessing.cpu_count() - 1])
+ self.assertEqual(sample, [expected_affinity])
class ConvDataset(Dataset):
def __init__(self):
# Author: Caspar van Leeuwen
# PyTorch's CMAKE configuration by default sets RUNPATH on libraries if they link other libraries
# that are outside the build tree, which is done because of the CMAKE config on
# https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L10.
# This provides problems, since the cuda stubs library path then also gets added to the RUNPATH.
# As a result, at runtime, the stub version of things like libcuda.so.1 gets picked up, instead of the real drivers
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/14359
# This line https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L16
# Makes sure that any path that is linked, is also added to the RUNPATH.
# This has been reported upstream in https://github.com/pytorch/pytorch/issues/35418
# and a fix was attempted in https://github.com/pytorch/pytorch/pull/37737 but it was reverted
#
# This EasyBuild patch changes behavior for the libraries that were failing, i.e. the ones in this list:
# https://github.com/easybuilders/easybuild-easyconfigs/issues/14359#issuecomment-970479904
# This is done by setting INSTALL_RPATH_USE_LINK_PATH to false, and instead, specifying the RPATH
# explicitely by defining INSTALL_RPATH, but only adding directories that do not match to the "stubs" regex
# It has been upstreamed in this PR https://github.com/pytorch/pytorch/pull/68912 (not accepted yet at the time of writing)
diff -Nru pytorch.orig/caffe2/CMakeLists.txt pytorch/caffe2/CMakeLists.txt
--- pytorch.orig/caffe2/CMakeLists.txt 2021-11-17 11:46:01.797337624 +0100
+++ pytorch/caffe2/CMakeLists.txt 2021-11-18 19:05:35.637707235 +0100
@@ -630,8 +630,33 @@
else()
set(DELAY_LOAD_FLAGS "")
endif()
- target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS})
+ # message("CUDA_NVRTC: ${CUDA_NVRTC}")
+ # message("CUDA_NVRTC_LIB: ${CUDA_NVRTC_LIB}")
+ # message("CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # message("DELAY_LOAD_FLAGS: ${DELAY_LOAD_FLAGS}")
+ # if(CUDA_CUDA_LIB MATCHES "stubs")
+ # message("stubs libraries found in the CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # else()
+ # message("Stubs libs not found in CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # endif()
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of CAFFE2_NVRTC:
+ set(CAFFE2_NVRTC_LIBS ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB})
+ foreach(LIB IN LISTS CAFFE2_NVRTC_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND CAFFE2_NVRTC_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("CAFFE2_NVRTC_RPATH: ${CAFFE2_NVRTC_RPATH}")
+ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH "${CAFFE2_NVRTC_RPATH}")
+ target_link_libraries(caffe2_nvrtc ${CAFFE2_NVRTC_LIBS} ${DELAY_LOAD_FLAGS})
target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS})
+# message(FATAL_ERROR "STOP HERE, we're debugging")
install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(USE_NCCL AND BUILD_SPLIT_CUDA)
list(APPEND Caffe2_GPU_SRCS_CPP
diff -Nru pytorch.orig/test/cpp/api/CMakeLists.txt pytorch/test/cpp/api/CMakeLists.txt
--- pytorch.orig/test/cpp/api/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100
+++ pytorch/test/cpp/api/CMakeLists.txt 2021-11-18 19:06:41.207423777 +0100
@@ -61,6 +61,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_api:
+ set(TEST_API_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS TEST_API_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TEST_API_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TEST_API_RPATH: ${TEST_API_RPATH}")
+ set_target_properties(test_api PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_api PROPERTIES INSTALL_RPATH "${TEST_API_RPATH}")
+
target_compile_definitions(test_api PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt pytorch/test/cpp/dist_autograd/CMakeLists.txt
--- pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100
+++ pytorch/test/cpp/dist_autograd/CMakeLists.txt 2021-11-18 19:06:18.389174421 +0100
@@ -16,6 +16,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_dist_autograd:
+ set(DIST_AUTOGRAD_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS DIST_AUTOGRAD_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND DIST_AUTOGRAD_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("DIST_AUTOGRAD_RPATH: ${DIST_AUTOGRAD_RPATH}")
+ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH "${DIST_AUTOGRAD_RPATH}")
+
target_compile_definitions(test_dist_autograd PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/jit/CMakeLists.txt pytorch/test/cpp/jit/CMakeLists.txt
--- pytorch.orig/test/cpp/jit/CMakeLists.txt 2021-11-17 11:46:02.989350630 +0100
+++ pytorch/test/cpp/jit/CMakeLists.txt 2021-11-18 19:05:41.396770168 +0100
@@ -94,6 +94,7 @@
list(APPEND JIT_TEST_DEPENDENCIES onnx_library)
endif(MSVC)
+
target_link_libraries(test_jit PRIVATE ${JIT_TEST_DEPENDENCIES})
target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE})
@@ -109,6 +110,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_jit:
+ set(TEST_JIT_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS TEST_JIT_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_jit's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TEST_JIT_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TEST_JIT_RPATH: ${TEST_JIT_RPATH}")
+ set_target_properties(test_jit PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_jit PROPERTIES INSTALL_RPATH "${TEST_JIT_RPATH}")
+
target_compile_definitions(test_jit PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_link_libraries(test_jit PRIVATE
diff -Nru pytorch.orig/test/cpp/rpc/CMakeLists.txt pytorch/test/cpp/rpc/CMakeLists.txt
--- pytorch.orig/test/cpp/rpc/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100
+++ pytorch/test/cpp/rpc/CMakeLists.txt 2021-11-18 19:06:30.502306793 +0100
@@ -39,6 +39,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_cpp_rpc:
+ set(CPP_RPC_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS CPP_RPC_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND CPP_RPC_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("CPP_RPC_RPATH: ${CPP_RPC_RPATH}")
+ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH "${CPP_RPC_RPATH}")
+
target_compile_definitions(test_cpp_rpc PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt pytorch/test/cpp/tensorexpr/CMakeLists.txt
--- pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100
+++ pytorch/test/cpp/tensorexpr/CMakeLists.txt 2021-11-18 19:06:00.988984273 +0100
@@ -62,6 +62,24 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA)
+
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of tutorial_tensorexpr:
+ set(CUDA_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS CUDA_LINK_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_tensorexpr and tutorial_tensorexpr RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TENSOREXPR_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TENSOREXPR_RPATH: ${TENSOREXPR_RPATH}")
+ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}")
+ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}")
elseif(USE_ROCM)
target_link_libraries(test_tensorexpr PRIVATE
${ROCM_HIPRTC_LIB}
name = 'PyTorch'
version = '1.11'
versionsuffix = '-CUDA-%(cudaver)s'
homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""
toolchain = {'name': 'gcccoremkl', 'version': '11.2.0-2021.4.0'}
toolchainopts = {'openmp': True}
# toolchainopts = {'cstd': 'c++11'}
sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v1.11.0',
'recursive': True,
},
}]
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
# 'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
# 'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
# 'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch',
# 'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch',
# 'PyTorch-1.10.0_fix-test-cond-cpu.patch',
# 'PyTorch-1.10.0_fix-vnni-detection.patch',
# 'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch',
# 'PyTorch-1.10.0_skip_failing_ops_tests.patch',
# 'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
# 'PyTorch-1.10.0_fix-gcc11-ideep.patch',
# 'PyTorch-1.10.0_fix_gcc11_nullpointer.patch',
# 'cub-lint.yaml.patch',
# 'cub-cub.cuh.patch',
#('cub-cub-definitions.patch', 1),
# 'cub-context_gpu.patch',
# 'cub-accuracy_op.patch',
# 'cub-affine-channel_op.patch',
# 'cub-arg_ops.patch',
# 'cub-batch_moments_op.patch',
# 'cub-batch_sparse_to_dense_op.patch',
# 'cub-boolean_mask_ops.patch',
# 'cub-cross_entropy.patch',
# 'cub-distance_op.patch',
# 'cub-elementwise_div_op.patch',
# 'cub-elementwise_linear_op.patch',
# 'cub-elementwise_mul_op.patch',
# 'cub-elementwise_ops.patch',
# 'cub-find_op.patch',
# 'cub-generate_proposals_op.patch',
# 'cub-normalize_ops.patch',
# 'cub-one_hot_ops.patch',
# 'cub-pack_segments.patch',
# 'cub-prelu_op.patch',
# 'cub-reduce_front_back_max_ops.patch',
# 'cub-reduce_front_back_sum_mean_ops.patch',
# 'cub-reduction_ops.patch',
# 'cub-rmac_regions_op.patch',
# 'cub-segment_reduction_op_gpu.patch',
# 'cub-sequence_ops.patch',
# 'cub-softmax_ops.patch',
# 'cub-spatial_batch_norm_op_impl.patch',
# 'cub-adagrad_fused_op_gpu.patch',
# 'cub-adagrad_op_gpu.patch',
# 'cub-adam_op_gpu.patch',
#('cub-cub_namespace.patch', 1),
# 'cub-reduce.patch',
# 'cub-math-gpu.patch',
# 'cub-CMake-Dependencies.patch',
]
osdependencies = [OS_PKG_IBVERBS_DEV]
builddependencies = [
('CMake', '3.21.1'),
('hypothesis', '6.14.6'),
]
dependencies = [
('CUDA', '11.5', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.6'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.7.1'),
('SciPy-bundle', '2021.10'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14', '', SYSTEM),
('FFmpeg', '4.4.1'),
('Pillow-SIMD', '9.0.1'),
('cuDNN', '8.3.1.22', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.11.4', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]
# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']
custom_opts = ["USE_CUPTI_SO=1"]
configopts = 'MKL_THREADING_LAYER=sequential CFLAGS="$CFLAGS -fopenmp" CXXFLAGS="$CXXFLAGS -fopenmp" LDFLAGS=-fopenmp'
excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least, they time out forever no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
# Test from this suite timeout often. The process group backend is deprecated anyway
# 'distributed/rpc/test_process_group_agent',
'test_jit',
'test_jit_cuda_fuser',
'test_jit_legacy',
'test_jit_profiling',
'test_xnnpack_integration',
'distributed/_shard/sharded_optim/test_sharded_optim',
'distributed/_shard/sharded_tensor/ops/test_linear',
'distributed/_shard/sharded_tensor/test_megatron_prototype',
'distributions/test_distributions',
'test_cpp_extensions_jit',
'distributed/rpc/test_tensorpipe_agent',
'test_ops',
'distributed/fsdp/test_fsdp_memory', # fails on hdfml
'distributed/fsdp/test_fsdp_overlap', # fails on hdfml
'test_autograd', # fails on jureca dc and deep
'test_cuda', # fails on jureca dc
'test_multiprocessing', # fails on jureca dc
'test_nn', # fails on jureca dc
'test_profiler', # fails on jureca dc
'test_quantization', # fails on jureca dc
'distributed/_shard/sharded_tensor/test_sharded_tensor', # fails on deep
'distributed/algorithms/test_join', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_checkpoint', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_core', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_freezing_weights', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_memory', # fails on deep
'distributed/fsdp/test_fsdp_multiple_forward', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_multiple_wrapping', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_overlap', # fails on deep
'distributed/fsdp/test_fsdp_pure_fp16', # fails on deep and jureca dc
'distributed/fsdp/test_fsdp_uneven', # fails on deep and jureca dc
'distributed/fsdp/test_wrap', # fails on deep and jureca dc
'distributed/optim/test_zero_redundancy_optimizer', # fails on deep and jureca dc
'distributed/rpc/cuda/test_tensorpipe_agent', # fails on deep
'distributed/rpc/test_faulty_agent', # fails on deep
'distributed/test_c10d_gloo', # fails on deep
'test_model_dump', # fails on deep
'distributed/test_c10d_nccl', # fails on jureca dc
'distributed/test_c10d_spawn_nccl', # fails on jureca dc
'distributed/test_data_parallel', # fails on jureca dc
]
}
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
# The readelf sanity check can be taken out once the TestRPATH test from https://github.com/pytorch/pytorch/pull/68912
# is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"python -c 'import torch'",
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']
moduleclass = 'devel'
This test uses in-place operations which may generate NaNs making subsequent tests fail
See https://github.com/pytorch/pytorch/issues/48591
Author: Alexander Grund (TU Dresden)
diff --git a/test/test_torch.py b/test/test_torch.py
index 1f3f568f7b..237fb030f6 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -15060,7 +15060,7 @@ class TestTorchDeviceType(TestCase):
x_c = x.contiguous()
y_c = y.contiguous()
result_c = fn(x_c, y_c)
- result = fn(x, y)
+ result = fn(x.clone(), y)
self.assertEqual(result, result_c)
self.assertTrue(
result.is_contiguous(memory_format=memory_format),
This test fails randomly. I assume some concurrent test runners or another race condition
Should be safe even if this fails
See https://github.com/pytorch/pytorch/issues/48579
Author: Alexander Grund (TU Dresden)
diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
index 49e0a3cb45..e4e5b64ca1 100644
--- a/test/test_multiprocessing.py
+++ b/test/test_multiprocessing.py
@@ -202,7 +202,9 @@ class leak_checker(object):
# available_fds = self._get_next_fds(10)
# self.test_case.assertLessEqual(
# available_fds[-1] - self.next_fds[-1], 5)
- self.test_case.assertFalse(self.has_shm_files())
+ # self.test_case.assertFalse(self.has_shm_files())
+ if self.has_shm_files():
+ print("WARNING: has_shm_files test would have failed!")
return False
def check_pid(self, pid):
Disable a part of a test which uses the current GPUs CUDA compute capability
This will fail if the GPU is newer than what nvcc supports.
See https://github.com/pytorch/pytorch/issues/51950
Author: Alexander Grund (TU Dresden)
diff --git a/test/test_cpp_extensions_jit.py b/test/test_cpp_extensions_jit.py
index efda7cb2cf..64607346c8 100644
--- a/test/test_cpp_extensions_jit.py
+++ b/test/test_cpp_extensions_jit.py
@@ -181,11 +181,9 @@ class TestCppExtensionJIT(common.TestCase):
# - With/without '+PTX'
n = torch.cuda.device_count()
- capabilities = {torch.cuda.get_device_capability(i) for i in range(n)}
# expected values is length-2 tuple: (list of ELF, list of PTX)
# note: there should not be more than one PTX value
archflags = {
- '': (['{}{}'.format(capability[0], capability[1]) for capability in capabilities], None),
"Maxwell+Tegra;6.1": (['53', '61'], None),
"Pascal 3.5": (['35', '60', '61'], None),
"Volta": (['70'], ['70']),
Some tests fail when run with anything but 2 GPUs and others when run with anything but 2 or 4 GPUs.
So limit to 2 GPUs.
See https://github.com/pytorch/pytorch/issues/59548
Author: Alexander Grund (TU Dresden)
diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py
index 06f1b4f484..bc82f6c304 100644
--- a/test/distributed/optim/test_zero_redundancy_optimizer.py
+++ b/test/distributed/optim/test_zero_redundancy_optimizer.py
@@ -233,7 +233,7 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer):
class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
@property
def world_size(self):
- return min(4, max(2, torch.cuda.device_count()))
+ return 2
@skip_if_rocm
def test_step(self):
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment