From 49297bcd7b74392c6cebc943975391e98ea263d2 Mon Sep 17 00:00:00 2001
From: Alexandre Strube <a.strube@fz-juelich.de>
Date: Tue, 19 Apr 2022 16:58:52 +0200
Subject: [PATCH] PyTorch

---
 .../Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb  |   2 +-
 ...11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb | 106 ++++++-------
 .../PyTorch-1.11.0_fix_sharded_imports.patch  |  44 ++++++
 ...0_increase-distributed-test-timeout.patch} |   0
 ...1.11.0_increase_test_tolerances_TF32.patch | 143 ++++++++++++++++++
 5 files changed, 233 insertions(+), 62 deletions(-)
 create mode 100644 Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
 rename Golden_Repo/p/PyTorch/{PyTorch-1.11.0_increas-distributed-test-timeout.patch => PyTorch-1.11.0_increase-distributed-test-timeout.patch} (100%)
 create mode 100644 Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch

diff --git a/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb b/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
index 532e489bd..edaaf6e97 100644
--- a/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
+++ b/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
@@ -20,7 +20,7 @@ dependencies = [
     ('SciPy-bundle', '2021.10', '', ('gcccoremkl', '11.2.0-2021.4.0')),
     ('Shapely', '1.8.0'),
     ('lxml', '4.6.3'),
-    ('Pillow-SIMD', '8.3.1'),
+    ('Pillow-SIMD', '9.0.1'),
     ('PROJ', '8.1.0'),
     ('PyYAML', '5.4.1'),
 ]
diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
index 7fa9ef507..21bc0202b 100644
--- a/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
@@ -35,6 +35,7 @@ patches = [
     # 'PyTorch-1.10.0_skip_failing_ops_tests.patch',
     # 'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
     'PyTorch-1.10.0_skip_cmake_rpath.patch',
+    'PyTorch-1.11.0_fix_sharded_imports.patch',
     # 'PyTorch-1.10.0_fix-gcc11-ideep.patch',
     # 'PyTorch-1.10.0_fix_gcc11_nullpointer.patch',
     # 'cub-lint.yaml.patch',
@@ -75,26 +76,8 @@ patches = [
     # 'cub-math-gpu.patch',
     # 'cub-CMake-Dependencies.patch',
     'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
-    'PyTorch-1.11.0_increas-distributed-test-timeout.patch',
+    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
     'PyTorch-1.11.0_skip_failing_ops_tests.patch',
-
-
-
-]
-checksums = [
-    '7547d3d52ca7067f1ce82fa14d02c49e7ca9c9841cfbc1f1742ffe95c0bfd2d6', # PyTorch-1.11.tar.gz
-    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
-    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
-    '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
-    # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
-    'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
-    # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
-    '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
-    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
-    '91e67cd498918baafe3fd58e0ba04b610a3561d1d97cec2c934bfd48fffd8324', # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
-    # PyTorch-1.11.0_increas-distributed-test-timeout.patch
-    'bb9709590ea8bd329360ca345c70afb8ff028be80e112af7ee00abba58482316',
-    '88a312d4752fe72171a2292d0aa5438ada42b124be113015bb4969c83c723766', # PyTorch-1.11.0_skip_failing_ops_tests.patch
 ]

 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -139,48 +122,49 @@ excluded_tests = {
         'distributed/test_distributed_spawn',
         # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
         'test_optim',
-        # Test from this suite timeout often. The process group backend is deprecated anyway
-        # 'distributed/rpc/test_process_group_agent',
-        'test_jit',
-        'test_jit_cuda_fuser',
-        'test_jit_legacy',
-        'test_jit_profiling',
-        'test_xnnpack_integration',
-        'distributed/_shard/sharded_optim/test_sharded_optim',
-        'distributed/_shard/sharded_tensor/ops/test_linear',
-        'distributed/_shard/sharded_tensor/test_megatron_prototype',
-        'distributions/test_distributions',
-        'test_cpp_extensions_jit',
-        'distributed/rpc/test_tensorpipe_agent',
-        'test_ops',
-        'distributed/fsdp/test_fsdp_memory', # fails on hdfml
-        'distributed/fsdp/test_fsdp_overlap', # fails on hdfml
-        'test_autograd', # fails on jureca dc and deep
-        'test_cuda', # fails on jureca dc
-        'test_multiprocessing', # fails on jureca dc
-        'test_nn', # fails on jureca dc
-        'test_profiler', # fails on jureca dc
-        'test_quantization', # fails on jureca dc
-        'distributed/_shard/sharded_tensor/test_sharded_tensor', # fails on deep
-        'distributed/algorithms/test_join', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_checkpoint', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_core', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_freezing_weights', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_memory', # fails on deep
-        'distributed/fsdp/test_fsdp_multiple_forward', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_multiple_wrapping', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_overlap', # fails on deep
-        'distributed/fsdp/test_fsdp_pure_fp16', # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_uneven', # fails on deep and jureca dc
-        'distributed/fsdp/test_wrap', # fails on deep and jureca dc
-        'distributed/optim/test_zero_redundancy_optimizer', # fails on deep and jureca dc
-        'distributed/rpc/cuda/test_tensorpipe_agent', # fails on deep
-        'distributed/rpc/test_faulty_agent', # fails on deep
-        'distributed/test_c10d_gloo', # fails on deep
-        'test_model_dump', # fails on deep
-        'distributed/test_c10d_nccl', # fails on jureca dc
-        'distributed/test_c10d_spawn_nccl', # fails on jureca dc
-        'distributed/test_data_parallel', # fails on jureca dc
+        'test_jit', # fails on all systems
+        'test_jit_cuda_fuser', # fails on all systems
+        'test_jit_legacy', # fails on all systems
+        'test_jit_profiling', # fails on all systems
+        'test_jit_fuser_te', # fails on booster and dc
+        # 'test_xnnpack_integration',
+        'distributed/_shard/sharded_optim/test_sharded_optim', # fails on booster and dc
+        'distributed/_shard/sharded_tensor/ops/test_linear', # fails on booster and dc
+        'distributed/_shard/sharded_tensor/test_megatron_prototype', # fails on booster and dc
+        'distributions/test_distributions', # fails on all systems
+        'test_cpp_extensions_jit', # fails on all systems
+        'test_ops', # fails on booster, dc, jusuf (works on hdfml?)
+        'distributed/fsdp/test_fsdp_memory', # fails on jusuf and hdfml
+        'distributed/fsdp/test_fsdp_overlap', # fails on jusuf and hdfml
+
+        # Those tests fail when not running from a container or without latest patches
+        # 'distributed/rpc/test_tensorpipe_agent',
+        # 'test_autograd', # fails on jureca dc and deep
+        # 'test_cuda', # fails on jureca dc
+        # 'test_multiprocessing', # fails on jureca dc
+        # 'test_nn', # fails on jureca dc
+        # 'test_profiler', # fails on jureca dc
+        # 'test_quantization', # fails on jureca dc
+        'distributed/_shard/sharded_tensor/test_sharded_tensor', # fails on juwels cluster container and deep
+        # 'distributed/algorithms/test_join', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_checkpoint', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_core', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_freezing_weights', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_memory', # fails on deep
+        # 'distributed/fsdp/test_fsdp_multiple_forward', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_multiple_wrapping', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_overlap', # fails on deep
+        # 'distributed/fsdp/test_fsdp_pure_fp16', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_uneven', # fails on deep and jureca dc
+        # 'distributed/fsdp/test_wrap', # fails on deep and jureca dc
+        # 'distributed/optim/test_zero_redundancy_optimizer', # fails on deep and jureca dc
+        # 'distributed/rpc/cuda/test_tensorpipe_agent', # fails on deep
+        # 'distributed/rpc/test_faulty_agent', # fails on deep
+        # 'distributed/test_c10d_gloo', # fails on deep
+        # 'test_model_dump', # fails on deep
+        # 'distributed/test_c10d_nccl', # fails on jureca dc
+        # 'distributed/test_c10d_spawn_nccl', # fails on jureca dc
+        # 'distributed/test_data_parallel', # fails on jureca dc
     ]
 }

diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
new file mode 100644
index 000000000..b1e854c38
--- /dev/null
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
@@ -0,0 +1,44 @@
+# Fixes a "NameError: name 'sharded_tensor' is not defined" error
+# for the test_named_params_with_sharded_tensor test
+# See https://github.com/pytorch/pytorch/pull/73309
+From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
+From: wanchaol <wanchaol@devvm3348.frc0.facebook.com>
+Date: Wed, 23 Feb 2022 12:10:39 -0800
+Subject: [PATCH] [shard] fix some imports in tests
+
+This fix some imports in sharded optimizer tests
+
+Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)
+
+[ghstack-poisoned]
+---
+ .../_shard/sharded_optim/test_sharded_optim.py | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+index 085c928985eb..d3f1468aea3c 100644
+--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
++++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+@@ -2,7 +2,10 @@
+ 
+ import torch
+ import torch.optim as optim
+-import torch.distributed._shard.sharded_tensor
++from torch.distributed._shard import (
++    sharded_tensor,
++    shard_parameter
++)
+ 
+ from copy import deepcopy
+ from torch.distributed._shard.sharding_spec import (
+@@ -77,8 +80,8 @@ def shard_parameter(self):
+             ],
+         )
+ 
+-        sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
"weight", rowwise_sharding_spec) +- sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec) ++ shard_parameter(self.linear1, "weight", rowwise_sharding_spec) ++ shard_parameter(self.linear2, "weight", colwise_sharding_spec) + + def forward(self, inp): + return self.linear2(self.gelu(self.linear1(inp))) \ No newline at end of file diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increas-distributed-test-timeout.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch similarity index 100% rename from Golden_Repo/p/PyTorch/PyTorch-1.11.0_increas-distributed-test-timeout.patch rename to Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch new file mode 100644 index 000000000..3bce7e068 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch @@ -0,0 +1,143 @@ +# Author: Caspar van Leeuwen, SURF +# Fixes failing tests due to use of TensorFloat32 +# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue +# We increase tolerances for the asserts to make these tests pass +diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py +--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:31:13.069599000 +0200 ++++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py 2022-04-07 18:32:32.877406000 +0200 +@@ -77,7 +77,7 @@ + local_output = local_linear(inp) + + # Verify +- self.assertEqual(local_output, sharded_output) ++ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03) + + # Validate for torch.nn.functional.linear version. + local_output = torch.nn.functional.linear( +@@ -91,7 +91,7 @@ + # for reshard. We need to squeeze the # of dimensions manually. + if inp.dim() == 1: + sharded_output = sharded_output.squeeze(reshard_spec.dim) +- self.assertEqual(local_output, sharded_output) ++ self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03) + + # Compute loss and run backward pass. + local_output.sum().backward() +@@ -114,7 +114,7 @@ + + # Test backward gradient calculation. + self.assertEqual(sharded_linear.bias.grad, local_bias_grad) +- self.assertEqual(sharded_weight.grad, local_grad_narrowed) ++ self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03) + + # Test optimizer. 
+         previous = local_linear.weight.clone().detach()
+@@ -135,7 +135,7 @@
+         )
+         self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
+         self.assertNotEqual(previous_sharded_weight, sharded_weight)
+-        self.assertEqual(sharded_weight, local_weight_narrowed)
++        self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
+         self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
+         self.assertEqual(sharded_linear.bias, local_linear.bias)
+ 
+diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
+--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:31:13.091710000 +0200
++++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py 2022-04-07 18:41:03.744644000 +0200
+@@ -113,7 +113,7 @@
+         local_output = local_megatron_lm(inp)
+ 
+         # Verify
+-        self.assertEqual(local_output, sharded_output)
++        self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)
+ 
+         # Compute loss and run backward pass.
+         local_output.sum().backward()
+@@ -161,9 +161,9 @@
+         )
+ 
+         # Test backward gradient calculation.
+-        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
+-        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
+-        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
++        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
++        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
++        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
+         self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)
+ 
+         # Test optimizer.
+@@ -171,7 +171,7 @@
+         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
+         self.assertEqual(bias_fc1, local_bias_fc1)
+         self.assertEqual(bias_fc2, local_bias_fc2)
+-        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
++        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
+         self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
+         previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
+         previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
+@@ -197,13 +197,13 @@
+         self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
+         self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
+         self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
+-        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
+-        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
++        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
++        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)
+ 
+         # Test bias value after optimizer.
+         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
+         self.assertNotEqual(previous_bias_fc1, bias_fc1)
+-        self.assertEqual(bias_fc1, local_bias_fc1)
++        self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
+         self.assertNotEqual(previous_bias_fc2, bias_fc2)
+         self.assertEqual(bias_fc2, local_bias_fc2)
+ 
+diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
+--- pytorch_orig/test/test_stateless.py 2022-04-07 18:31:13.029968000 +0200
++++ pytorch/test/test_stateless.py 2022-04-07 18:43:46.723968000 +0200
+@@ -42,7 +42,7 @@
+         # existing params in module. So here we expect the result to be the
+         # same as the input if the weight swapping went well.
+         res = _stateless.functional_call(module, parameters, x)
+-        self.assertEqual(x, res)
++        self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
+         # check that the weight remain unmodified
+         cur_weight = to_check.l1.weight
+         cur_buffer = to_check.buffer
+diff -Nru pytorch_orig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
+--- pytorch_orig/test/test_jit_fuser_te.py 2022-04-07 18:31:13.046680000 +0200
++++ pytorch/test/test_jit_fuser_te.py 2022-04-12 18:21:00.355114000 +0200
+@@ -956,7 +956,7 @@
+     def test_lstm_traced(self):
+         for device in self.devices:
+             inputs = get_lstm_inputs(device)
+-            ge = self.checkTrace(LSTMCellF, inputs)
++            ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
+             graph = ge.graph_for(*inputs)
+             fusion_groups = self.findFusionGroups(graph)
+             # TODO: chunk
+diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
+--- pytorch_orig/torch/testing/_internal/jit_utils.py 2022-04-07 18:28:54.339477000 +0200
++++ pytorch/torch/testing/_internal/jit_utils.py 2022-04-12 18:19:59.614272000 +0200
+@@ -525,7 +525,7 @@
+     def checkTrace(self, func, reference_tensors, input_tensors=None,
+                    drop=None, allow_unused=False, verbose=False,
+                    inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
+-                   _force_outplace=False):
++                   _force_outplace=False, rtol=None, atol=None):
+ 
+         # TODO: check gradients for parameters, not just inputs
+         def allSum(vs):
+@@ -618,7 +618,10 @@
+ 
+         self.assertEqual(outputs, outputs_ge)
+         if inputs_require_grads:
+-            self.assertEqual(grads, grads_ge)
++            if atol is not None and rtol is not None:
++                self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
++            else:
++                self.assertEqual(grads, grads_ge)
+         for g2, g2_ge in zip(grads2, grads2_ge):
+             if g2 is None and g2_ge is None:
+                 continue
\ No newline at end of file
-- 
GitLab
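
Context for the PyTorch-1.11.0_increase_test_tolerances_TF32.patch above: on Ampere GPUs, PyTorch 1.11 evaluates float32 matrix multiplications in TensorFloat32 mode by default, which the patch header identifies as the cause of the failures (the tests pass once NVIDIA_TF32_OVERRIDE=0 disables TF32). The following standalone Python sketch is not part of the commit; it assumes a TF32-capable CUDA device and made-up tensor sizes, and only illustrates the magnitude of the numerical drift and why tolerances on the order of rtol=0.01 / atol=1e-03 are needed instead of the defaults.

# Illustrative sketch (assumption: a TF32-capable GPU such as an A100 is available).
import torch

torch.manual_seed(0)
a = torch.randn(1024, 1024, device="cuda")
b = torch.randn(1024, 1024, device="cuda")

# Reference product in full FP32, i.e. with TF32 disabled
# (the same effect NVIDIA_TF32_OVERRIDE=0 has at the library level).
torch.backends.cuda.matmul.allow_tf32 = False
ref = a @ b

# Same product with TF32 enabled (the PyTorch 1.11 default on Ampere):
# the 10-bit TF32 mantissa introduces roughly 1e-3 relative error.
torch.backends.cuda.matmul.allow_tf32 = True
tf32 = a @ b

print("max abs diff:", (ref - tf32).abs().max().item())
# Default tolerances would flag this difference as a mismatch,
# while the loosened tolerances used in the patch accept it.
torch.testing.assert_close(ref, tf32, rtol=0.01, atol=1e-03)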