Commit 7d38bb05 authored by Stepan Nassyr

Add PyTorch 1.11.0

parent f0d5938d
easyblock = 'PythonPackage'
name = 'expecttest'
version = '0.1.3'
versionsuffix = "-Python-%(pyver)s"
homepage = "https://github.com/ezyang/expecttest"
description = """This library implements expect tests (also known as "golden" tests). Expect tests are a method of
writing tests where instead of hard-coding the expected output of a test, you run the test to get the output, and
the test framework automatically populates the expected output. If the output of the test changes, you can rerun
the test with the environment variable EXPECTTEST_ACCEPT=1 to automatically update the expected output."""
toolchain = {'name': 'GCCcore', 'version': '11.1.0'}
sources = [SOURCE_TAR_GZ]
checksums = ['83057695811d94128aed13ed094a070db90e0a92ea40071f8ee073cbab57149a']
builddependencies = [('binutils', '2.36.1')]
dependencies = [('Python', '3.9.4')]
use_pip = True
download_dep_fail = True
sanity_pip_check = True
moduleclass = 'tools'
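For context, here is a minimal sketch of what an expect test with this library looks like; the test content is illustrative, but `expecttest.TestCase` and `assertExpectedInline` are the library's documented API:

```python
# Illustrative expect test; run once with EXPECTTEST_ACCEPT=1 to let the
# framework rewrite the inline "golden" string below from the actual output.
import unittest

import expecttest


class TestExpecttestExample(expecttest.TestCase):
    def test_repr(self):
        # Compares the actual string against the inline expected output;
        # on first run the expected string can simply be left empty.
        self.assertExpectedInline(str(['a', 'b']), """['a', 'b']""")


if __name__ == '__main__':
    unittest.main()
```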
easyblock = 'PythonPackage'
name = 'Pillow-SIMD'
version = '9.0.1'
versionsuffix = "-Python-%(pyver)s"
homepage = 'https://github.com/uploadcare/pillow-simd'
description = """Pillow is the 'friendly PIL fork' by Alex Clark and Contributors.
PIL is the Python Imaging Library by Fredrik Lundh and Contributors."""
toolchain = {'name': 'GCCcore', 'version': '11.1.0'}
source_urls = ['https://github.com/uploadcare/pillow-simd/archive/']
sources = ['%(version)s.tar.gz']
checksums = ['4f91ab5ede15bfc71075941b62a7db3eee337fe810588a57e3c0dc103ac1bb45']
builddependencies = [('binutils', '2.36.1')]
dependencies = [
    ('Python', '3.9.4'),
    ('libjpeg-turbo', '2.0.6'),
    ('libpng', '1.6.37'),
    ('zlib', '1.2.11'),
    ('LibTIFF', '4.2.0'),
    ('freetype', '2.10.4'),
]
use_pip = True
download_dep_fail = True
options = {'modulename': 'PIL'}
sanity_check_paths = {
    'files': [],
    'dirs': ['lib/python%(pyshortver)s/site-packages/PIL'],
}
sanity_pip_check = True
moduleclass = 'vis'
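A side note on `options = {'modulename': 'PIL'}`: pillow-simd is a drop-in Pillow replacement and installs under the import name `PIL`, which is also why the sanity check above looks for `site-packages/PIL`. A quick hedged usage sketch (the image content is arbitrary):

```python
# pillow-simd is imported as PIL, exactly like upstream Pillow.
from PIL import Image

img = Image.new('RGB', (64, 64), color='red')
# Resampling is one of the SIMD-accelerated code paths in pillow-simd.
img = img.resize((32, 32), resample=Image.LANCZOS)
print(img.size)  # -> (32, 32)
```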
# Author: Alexander Grund
# Avoid test failures in CGROUP environments
# See https://github.com/pytorch/pytorch/issues/44368 and https://github.com/pytorch/pytorch/pull/44369
diff -Nru pytorch.orig/test/test_dataloader.py pytorch/test/test_dataloader.py
--- pytorch.orig/test/test_dataloader.py 2021-10-28 19:19:23.284526686 +0200
+++ pytorch/test/test_dataloader.py 2021-10-28 19:21:31.860488973 +0200
@@ -2374,22 +2374,27 @@
         after = os.sched_getaffinity(0)
         return iter(after)
 
-
-def worker_set_affinity(_):
-    os.sched_setaffinity(0, [multiprocessing.cpu_count() - 1])
-
-
 @unittest.skipIf(
     not hasattr(os, 'sched_setaffinity'),
     "os.sched_setaffinity is not available")
 class TestSetAffinity(TestCase):
     def test_set_affinity_in_worker_init(self):
+        # Query the current affinity mask to avoid setting a disallowed one
+        old_affinity = os.sched_getaffinity(0)
+        if not old_affinity:
+            self.skipTest("No affinity information")
+        # Choose any
+        expected_affinity = list(old_affinity)[-1]
+
+        def worker_set_affinity(_):
+            os.sched_setaffinity(0, [expected_affinity])
+
         dataset = SetAffinityDataset()
         dataloader = torch.utils.data.DataLoader(
             dataset, num_workers=2, worker_init_fn=worker_set_affinity)
         for sample in dataloader:
-            self.assertEqual(sample, [multiprocessing.cpu_count() - 1])
+            self.assertEqual(sample, [expected_affinity])
 
 
 class ConvDataset(Dataset):
     def __init__(self):
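To illustrate the failure mode this patch works around (a sketch, not part of the patch itself): in a cgroup-restricted environment the CPUs a process may use are a subset of the machine's CPUs, so pinning to `multiprocessing.cpu_count() - 1` can select a disallowed CPU:

```python
# Sketch of the mismatch the patch addresses (Linux-only APIs).
import multiprocessing
import os

allowed = os.sched_getaffinity(0)    # CPUs this process is actually allowed to use
total = multiprocessing.cpu_count()  # CPUs physically present in the machine

print(f"allowed={sorted(allowed)}, total={total}")

# Under a cgroup cpuset, CPU index total - 1 may not be in `allowed`, so the
# original worker_set_affinity raised OSError; the patched test instead pins
# to a CPU taken from the current mask, which is always permitted.
if total - 1 not in allowed:
    print("pinning to cpu_count() - 1 would fail here")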
# Author: Caspar van Leeuwen
# PyTorch's CMake configuration by default sets RUNPATH on libraries if they link other libraries
# that are outside the build tree; this is done because of the CMake config at
# https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L10.
# This causes problems, since the CUDA stubs library path then also gets added to the RUNPATH.
# As a result, at runtime, the stub version of things like libcuda.so.1 gets picked up instead of the real driver.
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/14359
# The line https://github.com/pytorch/pytorch/blob/v1.10.0/cmake/Dependencies.cmake#L16
# makes sure that any path that is linked is also added to the RUNPATH.
# This has been reported upstream in https://github.com/pytorch/pytorch/issues/35418
# and a fix was attempted in https://github.com/pytorch/pytorch/pull/37737 but it was reverted
#
# This EasyBuild patch changes the behavior for the libraries that were failing, i.e. the ones in this list:
# https://github.com/easybuilders/easybuild-easyconfigs/issues/14359#issuecomment-970479904
# This is done by setting INSTALL_RPATH_USE_LINK_PATH to false and instead specifying the RPATH
# explicitly by defining INSTALL_RPATH, adding only directories that do not match the "stubs" regex.
# It has been upstreamed in https://github.com/pytorch/pytorch/pull/68912 (not yet accepted at the time of writing).
# A readelf-based check for the condition this patch prevents is sketched below, before the patch itself.
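A quick way to check whether a built library suffers from this problem (a hedged sketch using readelf; the library path is illustrative and mirrors the commented-out readelf sanity check in the PyTorch easyconfig further down):

```python
# Sketch: report whether a shared library's RPATH/RUNPATH mentions the CUDA
# stubs directory, i.e. the condition this patch is meant to prevent.
import re
import subprocess


def runpath_has_stubs(lib_path):
    out = subprocess.run(['readelf', '-d', lib_path],
                         capture_output=True, text=True, check=True).stdout
    dyn_entries = [line for line in out.splitlines()
                   if re.search(r'R(UN)?PATH', line)]
    return any('stubs' in line for line in dyn_entries)


# Illustrative path into an installed PyTorch:
# print(runpath_has_stubs('.../site-packages/torch/lib/libcaffe2_nvrtc.so'))
```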
diff -Nru pytorch.orig/caffe2/CMakeLists.txt pytorch/caffe2/CMakeLists.txt
--- pytorch.orig/caffe2/CMakeLists.txt 2021-11-17 11:46:01.797337624 +0100
+++ pytorch/caffe2/CMakeLists.txt 2021-11-18 19:05:35.637707235 +0100
@@ -630,8 +630,33 @@
else()
set(DELAY_LOAD_FLAGS "")
endif()
- target_link_libraries(caffe2_nvrtc ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB} ${DELAY_LOAD_FLAGS})
+ # message("CUDA_NVRTC: ${CUDA_NVRTC}")
+ # message("CUDA_NVRTC_LIB: ${CUDA_NVRTC_LIB}")
+ # message("CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # message("DELAY_LOAD_FLAGS: ${DELAY_LOAD_FLAGS}")
+ # if(CUDA_CUDA_LIB MATCHES "stubs")
+ # message("stubs libraries found in the CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # else()
+ # message("Stubs libs not found in CUDA_CUDA_LIB: ${CUDA_CUDA_LIB}")
+ # endif()
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of CAFFE2_NVRTC:
+ set(CAFFE2_NVRTC_LIBS ${CUDA_NVRTC} ${CUDA_CUDA_LIB} ${CUDA_NVRTC_LIB})
+ foreach(LIB IN LISTS CAFFE2_NVRTC_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in caffe2_nvrtc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND CAFFE2_NVRTC_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("CAFFE2_NVRTC_RPATH: ${CAFFE2_NVRTC_RPATH}")
+ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(caffe2_nvrtc PROPERTIES INSTALL_RPATH "${CAFFE2_NVRTC_RPATH}")
+ target_link_libraries(caffe2_nvrtc ${CAFFE2_NVRTC_LIBS} ${DELAY_LOAD_FLAGS})
target_include_directories(caffe2_nvrtc PRIVATE ${CUDA_INCLUDE_DIRS})
+# message(FATAL_ERROR "STOP HERE, we're debugging")
install(TARGETS caffe2_nvrtc DESTINATION "${TORCH_INSTALL_LIB_DIR}")
if(USE_NCCL AND BUILD_SPLIT_CUDA)
list(APPEND Caffe2_GPU_SRCS_CPP
diff -Nru pytorch.orig/test/cpp/api/CMakeLists.txt pytorch/test/cpp/api/CMakeLists.txt
--- pytorch.orig/test/cpp/api/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100
+++ pytorch/test/cpp/api/CMakeLists.txt 2021-11-18 19:06:41.207423777 +0100
@@ -61,6 +61,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_api:
+ set(TEST_API_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS TEST_API_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_api's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TEST_API_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TEST_API_RPATH: ${TEST_API_RPATH}")
+ set_target_properties(test_api PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_api PROPERTIES INSTALL_RPATH "${TEST_API_RPATH}")
+
target_compile_definitions(test_api PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt pytorch/test/cpp/dist_autograd/CMakeLists.txt
--- pytorch.orig/test/cpp/dist_autograd/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100
+++ pytorch/test/cpp/dist_autograd/CMakeLists.txt 2021-11-18 19:06:18.389174421 +0100
@@ -16,6 +16,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_dist_autograd:
+ set(DIST_AUTOGRAD_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS DIST_AUTOGRAD_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_dist_autograd's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND DIST_AUTOGRAD_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("DIST_AUTOGRAD_RPATH: ${DIST_AUTOGRAD_RPATH}")
+ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_dist_autograd PROPERTIES INSTALL_RPATH "${DIST_AUTOGRAD_RPATH}")
+
target_compile_definitions(test_dist_autograd PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/jit/CMakeLists.txt pytorch/test/cpp/jit/CMakeLists.txt
--- pytorch.orig/test/cpp/jit/CMakeLists.txt 2021-11-17 11:46:02.989350630 +0100
+++ pytorch/test/cpp/jit/CMakeLists.txt 2021-11-18 19:05:41.396770168 +0100
@@ -94,6 +94,7 @@
list(APPEND JIT_TEST_DEPENDENCIES onnx_library)
endif(MSVC)
+
target_link_libraries(test_jit PRIVATE ${JIT_TEST_DEPENDENCIES})
target_include_directories(test_jit PRIVATE ${ATen_CPU_INCLUDE})
@@ -109,6 +110,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_jit:
+ set(TEST_JIT_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS TEST_JIT_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_jit's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TEST_JIT_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TEST_JIT_RPATH: ${TEST_JIT_RPATH}")
+ set_target_properties(test_jit PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_jit PROPERTIES INSTALL_RPATH "${TEST_JIT_RPATH}")
+
target_compile_definitions(test_jit PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_link_libraries(test_jit PRIVATE
diff -Nru pytorch.orig/test/cpp/rpc/CMakeLists.txt pytorch/test/cpp/rpc/CMakeLists.txt
--- pytorch.orig/test/cpp/rpc/CMakeLists.txt 2021-11-17 11:46:02.991350652 +0100
+++ pytorch/test/cpp/rpc/CMakeLists.txt 2021-11-18 19:06:30.502306793 +0100
@@ -39,6 +39,22 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of test_cpp_rpc:
+ set(CPP_RPC_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS CPP_RPC_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_cpp_rpc's RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND CPP_RPC_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("CPP_RPC_RPATH: ${CPP_RPC_RPATH}")
+ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_cpp_rpc PROPERTIES INSTALL_RPATH "${CPP_RPC_RPATH}")
+
target_compile_definitions(test_cpp_rpc PRIVATE "USE_CUDA")
endif()
diff -Nru pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt pytorch/test/cpp/tensorexpr/CMakeLists.txt
--- pytorch.orig/test/cpp/tensorexpr/CMakeLists.txt 2021-11-17 11:46:02.993350674 +0100
+++ pytorch/test/cpp/tensorexpr/CMakeLists.txt 2021-11-18 19:06:00.988984273 +0100
@@ -62,6 +62,24 @@
${CUDA_CUDA_LIB}
${TORCH_CUDA_LIBRARIES})
target_compile_definitions(tutorial_tensorexpr PRIVATE USE_CUDA)
+
+ # Make sure the CUDA stubs folder doesn't end up in the RPATH of tutorial_tensorexpr:
+ set(CUDA_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_NVRTC_LIB} ${CUDA_CUDA_LIB} ${TORCH_CUDA_LIBRARIES})
+ foreach(LIB IN LISTS CUDA_LINK_LIBS)
+ message("LIB: ${LIB}")
+ if(LIB MATCHES "stubs")
+ message("Filtering ${LIB} from being set in test_tensorexpr and tutorial_tensorexpr RPATH, because it appears to point to the CUDA stubs directory, which should not be RPATHed.")
+ else()
+ cmake_path(GET LIB PARENT_PATH LIB_PATH)
+ message("LIBPATH: ${LIB_PATH}")
+ list(APPEND TENSOREXPR_RPATH ${LIB_PATH})
+ endif()
+ endforeach()
+ message("TENSOREXPR_RPATH: ${TENSOREXPR_RPATH}")
+ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(test_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}")
+ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH_USE_LINK_PATH FALSE)
+ set_target_properties(tutorial_tensorexpr PROPERTIES INSTALL_RPATH "${TENSOREXPR_RPATH}")
elseif(USE_ROCM)
target_link_libraries(test_tensorexpr PRIVATE
${ROCM_HIPRTC_LIB}
name = 'PyTorch'
version = '1.11'
versionsuffix = '-Python-%(pyver)s'
homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""
toolchain = {'name': 'goolf', 'version': '2021a.11'}
toolchainopts = {'openmp': True}
# toolchainopts = {'cstd': 'c++11'}
sources = [{
    'filename': '%(name)s-%(version)s.tar.gz',
    'git_config': {
        'url': 'https://github.com/pytorch',
        'repo_name': 'pytorch',
        'tag': 'v1.11.0',
        'recursive': True,
    },
}]
patches = [
    'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
    'PyTorch-1.7.0_disable-dev-shm-test.patch',
    # 'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch',
    'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
    # 'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
    'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
    'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
    # 'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch',
    # 'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch',
    # 'PyTorch-1.10.0_fix-test-cond-cpu.patch',
    # 'PyTorch-1.10.0_fix-vnni-detection.patch',
    # 'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch',
    # 'PyTorch-1.10.0_skip_failing_ops_tests.patch',
    # 'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
    'PyTorch-1.10.0_skip_cmake_rpath.patch',
    'PyTorch-1.11.0_fix_sharded_imports.patch',
    # 'PyTorch-1.10.0_fix-gcc11-ideep.patch',
    # 'PyTorch-1.10.0_fix_gcc11_nullpointer.patch',
    # 'cub-lint.yaml.patch',
    # 'cub-cub.cuh.patch',
    # ('cub-cub-definitions.patch', 1),
    # 'cub-context_gpu.patch',
    # 'cub-accuracy_op.patch',
    # 'cub-affine-channel_op.patch',
    # 'cub-arg_ops.patch',
    # 'cub-batch_moments_op.patch',
    # 'cub-batch_sparse_to_dense_op.patch',
    # 'cub-boolean_mask_ops.patch',
    # 'cub-cross_entropy.patch',
    # 'cub-distance_op.patch',
    # 'cub-elementwise_div_op.patch',
    # 'cub-elementwise_linear_op.patch',
    # 'cub-elementwise_mul_op.patch',
    # 'cub-elementwise_ops.patch',
    # 'cub-find_op.patch',
    # 'cub-generate_proposals_op.patch',
    # 'cub-normalize_ops.patch',
    # 'cub-one_hot_ops.patch',
    # 'cub-pack_segments.patch',
    # 'cub-prelu_op.patch',
    # 'cub-reduce_front_back_max_ops.patch',
    # 'cub-reduce_front_back_sum_mean_ops.patch',
    # 'cub-reduction_ops.patch',
    # 'cub-rmac_regions_op.patch',
    # 'cub-segment_reduction_op_gpu.patch',
    # 'cub-sequence_ops.patch',
    # 'cub-softmax_ops.patch',
    # 'cub-spatial_batch_norm_op_impl.patch',
    # 'cub-adagrad_fused_op_gpu.patch',
    # 'cub-adagrad_op_gpu.patch',
    # 'cub-adam_op_gpu.patch',
    # ('cub-cub_namespace.patch', 1),
    # 'cub-reduce.patch',
    # 'cub-math-gpu.patch',
    # 'cub-CMake-Dependencies.patch',
    'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
    'PyTorch-1.11.0_skip_failing_ops_tests.patch',
]
checksums = [
    '5c8abb8c7f0cd910c2f49e9bc699811dd7996d2f435173627ea5f551dc545e92',  # PyTorch-1.11.tar.gz
    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
    '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6',  # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
    # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
    'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
    # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
    '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',  # PyTorch-1.10.0_skip_cmake_rpath.patch
    '2e3e2093fce314a9ee9fb73ef44477f4c2cedfcf27570f585c6917ae434311f2',  # PyTorch-1.11.0_fix_sharded_imports.patch
    '91e67cd498918baafe3fd58e0ba04b610a3561d1d97cec2c934bfd48fffd8324',  # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
    # PyTorch-1.11.0_increase-distributed-test-timeout.patch
    'bb9709590ea8bd329360ca345c70afb8ff028be80e112af7ee00abba58482316',
    '88a312d4752fe72171a2292d0aa5438ada42b124be113015bb4969c83c723766',  # PyTorch-1.11.0_skip_failing_ops_tests.patch
]
osdependencies = [OS_PKG_IBVERBS_DEV]
builddependencies = [
    ('CMake', '3.20.0'),
    # ('hypothesis', '6.14.6'),
]
dependencies = [
    ('Ninja', '1.10.2'),  # Required for JIT compilation of C++ extensions
    ('Python', '3.9.4'),
    ('protobuf', '3.17.3'),
    ('protobuf-python', '3.17.3', versionsuffix),
    ('pybind11', '2.6.2', versionsuffix),
    ('SciPy-Stack', '2021a', versionsuffix),
    # ('typing-extensions', '3.10.0.0'),
    ('PyYAML', '5.4.1', versionsuffix),
    ('MPFR', '4.1.0'),
    ('GMP', '6.2.1'),
    ('numactl', '2.0.14'),
    ('FFmpeg', '4.3.2'),
    ('Pillow-SIMD', '9.0.1', versionsuffix),
    ('expecttest', '0.1.3', versionsuffix),
]
#custom_opts = ["USE_CUPTI_SO=1"]
configopts = 'CFLAGS="$CFLAGS -fopenmp" CXXFLAGS="$CXXFLAGS -fopenmp" LDFLAGS=-fopenmp'
excluded_tests = {
    '': [
        # Bad tests: https://github.com/pytorch/pytorch/issues/60260
        'distributed/elastic/utils/distributed_test',
        'distributed/elastic/multiprocessing/api_test',
        # These tests fail on A10s at the very least; they time out no matter how long the timeout is set.
        # Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
        # 'distributed/test_distributed_fork',
        'distributed/test_distributed_spawn',
        # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
        'test_optim',
        'test_jit',  # fails on all systems
        'test_jit_cuda_fuser',  # fails on all systems
        'test_jit_legacy',  # fails on all systems
        'test_jit_profiling',  # fails on all systems
        'test_jit_fuser_te',  # fails on booster and dc
        # 'test_xnnpack_integration',
        'distributed/_shard/sharded_optim/test_sharded_optim',  # fails on booster and dc
        'distributed/_shard/sharded_tensor/ops/test_linear',  # fails on booster and dc
        'distributed/_shard/sharded_tensor/test_megatron_prototype',  # fails on booster and dc
        'distributions/test_distributions',  # fails on all systems
        'test_cpp_extensions_jit',  # fails on all systems
        'test_ops',  # fails on booster, dc, jusuf (works on hdfml?)
        'distributed/fsdp/test_fsdp_memory',  # fails on jusuf and hdfml
        'distributed/fsdp/test_fsdp_overlap',  # fails on jusuf and hdfml
        # These tests fail when not running from a container or without the latest patches
        # 'distributed/rpc/test_tensorpipe_agent',
        # 'test_autograd',  # fails on jureca dc and deep
        # 'test_cuda',  # fails on jureca dc
        # 'test_multiprocessing',  # fails on jureca dc
        # 'test_nn',  # fails on jureca dc
        # 'test_profiler',  # fails on jureca dc
        # 'test_quantization',  # fails on jureca dc
        'distributed/_shard/sharded_tensor/test_sharded_tensor',  # fails on juwels cluster container and deep
        # 'distributed/algorithms/test_join',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_checkpoint',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_core',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_freezing_weights',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_memory',  # fails on deep
        # 'distributed/fsdp/test_fsdp_multiple_forward',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_multiple_wrapping',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_overlap',  # fails on deep
        # 'distributed/fsdp/test_fsdp_pure_fp16',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_fsdp_uneven',  # fails on deep and jureca dc
        # 'distributed/fsdp/test_wrap',  # fails on deep and jureca dc
        # 'distributed/optim/test_zero_redundancy_optimizer',  # fails on deep and jureca dc
        # 'distributed/rpc/cuda/test_tensorpipe_agent',  # fails on deep
        # 'distributed/rpc/test_faulty_agent',  # fails on deep
        # 'distributed/test_c10d_gloo',  # fails on deep
        'test_model_dump',  # fails on deep
        'test_unary_ufuncs',  # fails on haicgu
        # 'distributed/test_c10d_nccl',  # fails on jureca dc
        # 'distributed/test_c10d_spawn_nccl',  # fails on jureca dc
        # 'distributed/test_data_parallel',  # fails on jureca dc
    ]
}
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'
# The readelf sanity check can be taken out once the TestRPATH test from https://github.com/pytorch/pytorch/pull/68912
# is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
    "python -c 'import torch'",
    # "readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]
tests = ['PyTorch-check-cpp-extension.py']
moduleclass = 'devel'
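The exact contents of PyTorch-check-cpp-extension.py are not part of this commit; a minimal check along those lines might look like the following sketch, using `torch.utils.cpp_extension.load_inline` (which JIT-compiles with Ninja, the reason Ninja is listed as a runtime dependency above):

```python
# Hedged sketch of a C++ extension JIT-compilation check; the real
# PyTorch-check-cpp-extension.py script may differ.
import torch
from torch.utils.cpp_extension import load_inline

cpp_source = """
torch::Tensor add_one(torch::Tensor x) {
    return x + 1;
}
"""

# load_inline compiles the extension on the fly (via Ninja) and imports it.
ext = load_inline(name='check_ext', cpp_sources=cpp_source, functions=['add_one'])
assert torch.equal(ext.add_one(torch.zeros(3)), torch.ones(3))
print('C++ extension JIT compilation works')
```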
# Fixes a "NameError: name 'sharded_tensor' is not defined" error
# for the test_named_params_with_sharded_tensor test
# See https://github.com/pytorch/pytorch/pull/73309
From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
From: wanchaol <wanchaol@devvm3348.frc0.facebook.com>
Date: Wed, 23 Feb 2022 12:10:39 -0800
Subject: [PATCH] [shard] fix some imports in tests
This fix some imports in sharded optimizer tests
Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)
[ghstack-poisoned]
---
.../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
index 085c928985eb..d3f1468aea3c 100644
--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
@@ -2,7 +2,10 @@
 
 import torch
 import torch.optim as optim
-import torch.distributed._shard.sharded_tensor
+from torch.distributed._shard import (
+    sharded_tensor,
+    shard_parameter
+)
 
 from copy import deepcopy
 from torch.distributed._shard.sharding_spec import (
@@ -77,8 +80,8 @@ def shard_parameter(self):
             ],
         )
 
-        sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
-        sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec)
+        shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
+        shard_parameter(self.linear2, "weight", colwise_sharding_spec)
 
     def forward(self, inp):
         return self.linear2(self.gelu(self.linear1(inp)))
\ No newline at end of file
It seems the timeout for the distributed tests is set too low, and spurious failures can be seen.
Increase it by a factor of 6, similar to torch/testing/_internal/distributed/distributed_test.py.
Original patch by Alexander Grund (TU Dresden), updated by Caspar van Leeuwen (SURF)
diff -Nru pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py
--- pytorch-1.11.0-rc3.orig/torch/testing/_internal/common_distributed.py 2022-02-24 18:07:16.414274654 +0100
+++ pytorch-1.11.0-rc3/torch/testing/_internal/common_distributed.py 2022-02-24 18:08:31.772851148 +0100
@@ -321,7 +321,7 @@
     # TSAN runs much slower.
     TIMEOUT_DEFAULT = 500
 else:
-    TIMEOUT_DEFAULT = 100
+    TIMEOUT_DEFAULT = 600
 
 TIMEOUT_OVERRIDE = {"test_ddp_uneven_inputs": 400}
\ No newline at end of file
# Author: Caspar van Leeuwen, SURF
# Fixes failing tests due to use of TensorFloat32
# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
# We increase the tolerances of the asserts so that these tests pass with TensorFloat32 enabled
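For reference, the TF32 behavior the comment above describes can be exercised directly; a sketch (requires an Ampere-or-newer GPU, and `torch.backends.cuda.matmul.allow_tf32` defaults to True in PyTorch 1.11):

```python
# Sketch showing why exact equality fails under TF32.
import torch

a = torch.randn(1024, 1024, device='cuda')
b = torch.randn(1024, 1024, device='cuda')

torch.backends.cuda.matmul.allow_tf32 = True   # PyTorch 1.11 default
tf32_out = a @ b

torch.backends.cuda.matmul.allow_tf32 = False  # same effect as NVIDIA_TF32_OVERRIDE=0
fp32_out = a @ b

# The max difference far exceeds float32 round-off, hence the relaxed
# rtol/atol in the asserts patched below.
print((tf32_out - fp32_out).abs().max())
```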
diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200
+++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:32:32.877406000 +0200
@@ -77,7 +77,7 @@
         local_output = local_linear(inp)
 
         # Verify
-        self.assertEqual(local_output, sharded_output)
+        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
 
         # Validate for torch.nn.functional.linear version.
         local_output = torch.nn.functional.linear(
@@ -91,7 +91,7 @@
         # for reshard. We need to squeeze the # of dimensions manually.
         if inp.dim() == 1:
             sharded_output = sharded_output.squeeze(reshard_spec.dim)
-        self.assertEqual(local_output, sharded_output)
+        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
 
         # Compute loss and run backward pass.
         local_output.sum().backward()
@@ -114,7 +114,7 @@
         # Test backward gradient calculation.
         self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
-        self.assertEqual(sharded_weight.grad, local_grad_narrowed)