From 49297bcd7b74392c6cebc943975391e98ea263d2 Mon Sep 17 00:00:00 2001
From: Alexandre Strube <a.strube@fz-juelich.de>
Date: Tue, 19 Apr 2022 16:58:52 +0200
Subject: [PATCH] PyTorch

---
 .../Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb  |   2 +-
 ...11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb | 106 ++++++-------
 .../PyTorch-1.11.0_fix_sharded_imports.patch  |  44 ++++++
 ...0_increase-distributed-test-timeout.patch} |   0
 ...1.11.0_increase_test_tolerances_TF32.patch | 143 ++++++++++++++++++
 5 files changed, 233 insertions(+), 62 deletions(-)
 create mode 100644 Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
 rename Golden_Repo/p/PyTorch/{PyTorch-1.11.0_increas-distributed-test-timeout.patch => PyTorch-1.11.0_increase-distributed-test-timeout.patch} (100%)
 create mode 100644 Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch

diff --git a/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb b/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
index 532e489bd..edaaf6e97 100644
--- a/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
+++ b/Golden_Repo/c/Cartopy/Cartopy-0.20.0-GCCcore-11.2.0.eb
@@ -20,7 +20,7 @@ dependencies = [
     ('SciPy-bundle', '2021.10', '', ('gcccoremkl', '11.2.0-2021.4.0')),
     ('Shapely', '1.8.0'),
     ('lxml', '4.6.3'),
-    ('Pillow-SIMD', '8.3.1'),
+    ('Pillow-SIMD', '9.0.1'),
     ('PROJ', '8.1.0'),
     ('PyYAML', '5.4.1'),
 ]
diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
index 7fa9ef507..21bc0202b 100644
--- a/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.11-gcccoremkl-11.2.0-2021.4.0-CUDA-11.5.eb
@@ -35,6 +35,7 @@ patches = [
     # 'PyTorch-1.10.0_skip_failing_ops_tests.patch',
     # 'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
     'PyTorch-1.10.0_skip_cmake_rpath.patch',
+    'PyTorch-1.11.0_fix_sharded_imports.patch',
     # 'PyTorch-1.10.0_fix-gcc11-ideep.patch',
     # 'PyTorch-1.10.0_fix_gcc11_nullpointer.patch',
     # 'cub-lint.yaml.patch',
@@ -75,26 +76,8 @@ patches = [
     # 'cub-math-gpu.patch',
     # 'cub-CMake-Dependencies.patch',
     'PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch',
-    'PyTorch-1.11.0_increas-distributed-test-timeout.patch',
+    'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
     'PyTorch-1.11.0_skip_failing_ops_tests.patch',
-
-
-
-]
-checksums = [
-    '7547d3d52ca7067f1ce82fa14d02c49e7ca9c9841cfbc1f1742ffe95c0bfd2d6',  # PyTorch-1.11.tar.gz
-    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
-    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
-    '89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6',  # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
-    # PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
-    'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
-    # PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
-    '313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
-    'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',  # PyTorch-1.10.0_skip_cmake_rpath.patch
-    '91e67cd498918baafe3fd58e0ba04b610a3561d1d97cec2c934bfd48fffd8324',  # PyTorch-1.11.0_fix_skip_jit_cuda_fuser.patch
-    # PyTorch-1.11.0_increas-distributed-test-timeout.patch
-    'bb9709590ea8bd329360ca345c70afb8ff028be80e112af7ee00abba58482316',
-    '88a312d4752fe72171a2292d0aa5438ada42b124be113015bb4969c83c723766',  # PyTorch-1.11.0_skip_failing_ops_tests.patch
 ]
 
 osdependencies = [OS_PKG_IBVERBS_DEV]
@@ -139,48 +122,49 @@ excluded_tests = {
         'distributed/test_distributed_spawn',
         # Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
         'test_optim',
-        # Test from this suite timeout often. The process group backend is deprecated anyway
-        # 'distributed/rpc/test_process_group_agent',
-        'test_jit',
-        'test_jit_cuda_fuser',
-        'test_jit_legacy',
-        'test_jit_profiling',
-        'test_xnnpack_integration',
-        'distributed/_shard/sharded_optim/test_sharded_optim',
-        'distributed/_shard/sharded_tensor/ops/test_linear',
-        'distributed/_shard/sharded_tensor/test_megatron_prototype',
-        'distributions/test_distributions',
-        'test_cpp_extensions_jit',
-        'distributed/rpc/test_tensorpipe_agent',
-        'test_ops',
-        'distributed/fsdp/test_fsdp_memory',  # fails on hdfml
-        'distributed/fsdp/test_fsdp_overlap',  # fails on hdfml
-        'test_autograd',  # fails on jureca dc and deep
-        'test_cuda',  # fails on jureca dc
-        'test_multiprocessing',  # fails on jureca dc
-        'test_nn',  # fails on jureca dc
-        'test_profiler',  # fails on jureca dc
-        'test_quantization',  # fails on jureca dc
-        'distributed/_shard/sharded_tensor/test_sharded_tensor',  # fails on deep
-        'distributed/algorithms/test_join',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_checkpoint',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_core',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_freezing_weights',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_memory',  # fails on deep
-        'distributed/fsdp/test_fsdp_multiple_forward',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_multiple_wrapping',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_overlap',  # fails on deep
-        'distributed/fsdp/test_fsdp_pure_fp16',  # fails on deep and jureca dc
-        'distributed/fsdp/test_fsdp_uneven',  # fails on deep and jureca dc
-        'distributed/fsdp/test_wrap',  # fails on deep and jureca dc
-        'distributed/optim/test_zero_redundancy_optimizer',  # fails on deep and jureca dc
-        'distributed/rpc/cuda/test_tensorpipe_agent',  # fails on deep
-        'distributed/rpc/test_faulty_agent',  # fails on deep
-        'distributed/test_c10d_gloo',  # fails on deep
-        'test_model_dump',  # fails on deep
-        'distributed/test_c10d_nccl',  # fails on jureca dc
-        'distributed/test_c10d_spawn_nccl',  # fails on jureca dc
-        'distributed/test_data_parallel',  # fails on jureca dc
+        'test_jit',  # fails on all systems
+        'test_jit_cuda_fuser',  # fails on all systems
+        'test_jit_legacy',  # fails on all systems
+        'test_jit_profiling',  # fails on all systems
+        'test_jit_fuser_te',  # fails on booster and dc
+        # 'test_xnnpack_integration',
+        'distributed/_shard/sharded_optim/test_sharded_optim',  # fails on booster and dc
+        'distributed/_shard/sharded_tensor/ops/test_linear',  # fails on booster and dc
+        'distributed/_shard/sharded_tensor/test_megatron_prototype',  # fails on booster and dc
+        'distributions/test_distributions',  # fails on all systems
+        'test_cpp_extensions_jit',  # fails on al systems
+        'test_ops',  # fails on booster, dc, jusuf (works on hdfml?)
+        'distributed/fsdp/test_fsdp_memory',  # fails on jusuf and hdfml
+        'distributed/fsdp/test_fsdp_overlap',  # fails on jusuf and hdfml
+
+        # Those tests fail when not running from a container or without latest patches
+        # 'distributed/rpc/test_tensorpipe_agent',
+        # 'test_autograd',  # fails on jureca dc and deep
+        # 'test_cuda',  # fails on jureca dc
+        # 'test_multiprocessing',  # fails on jureca dc
+        # 'test_nn',  # fails on jureca dc
+        # 'test_profiler',  # fails on jureca dc
+        # 'test_quantization',  # fails on jureca dc
+        'distributed/_shard/sharded_tensor/test_sharded_tensor',  # fails on juwels cluster container and deep
+        # 'distributed/algorithms/test_join',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_checkpoint',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_core',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_freezing_weights',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_memory',  # fails on deep
+        # 'distributed/fsdp/test_fsdp_multiple_forward',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_multiple_wrapping',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_overlap',  # fails on deep
+        # 'distributed/fsdp/test_fsdp_pure_fp16',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_fsdp_uneven',  # fails on deep and jureca dc
+        # 'distributed/fsdp/test_wrap',  # fails on deep and jureca dc
+        # 'distributed/optim/test_zero_redundancy_optimizer',  # fails on deep and jureca dc
+        # 'distributed/rpc/cuda/test_tensorpipe_agent',  # fails on deep
+        # 'distributed/rpc/test_faulty_agent',  # fails on deep
+        # 'distributed/test_c10d_gloo',  # fails on deep
+        # 'test_model_dump',  # fails on deep
+        # 'distributed/test_c10d_nccl',  # fails on jureca dc
+        # 'distributed/test_c10d_spawn_nccl',  # fails on jureca dc
+        # 'distributed/test_data_parallel',  # fails on jureca dc
     ]
 }
 
diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
new file mode 100644
index 000000000..b1e854c38
--- /dev/null
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_fix_sharded_imports.patch
@@ -0,0 +1,44 @@
+# Fixes a "NameError: name 'sharded_tensor' is not defined" error 
+# for the test_named_params_with_sharded_tensor test
+# See https://github.com/pytorch/pytorch/pull/73309
+From 012d490ed76d8af8538d310a508b0e09a91b7632 Mon Sep 17 00:00:00 2001
+From: wanchaol <wanchaol@devvm3348.frc0.facebook.com>
+Date: Wed, 23 Feb 2022 12:10:39 -0800
+Subject: [PATCH] [shard] fix some imports in tests
+
+This fix some imports in sharded optimizer tests
+
+Differential Revision: [D34427252](https://our.internmc.facebook.com/intern/diff/D34427252/)
+
+[ghstack-poisoned]
+---
+ .../_shard/sharded_optim/test_sharded_optim.py           | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/test/distributed/_shard/sharded_optim/test_sharded_optim.py b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+index 085c928985eb..d3f1468aea3c 100644
+--- a/test/distributed/_shard/sharded_optim/test_sharded_optim.py
++++ b/test/distributed/_shard/sharded_optim/test_sharded_optim.py
+@@ -2,7 +2,10 @@
+ 
+ import torch
+ import torch.optim as optim
+-import torch.distributed._shard.sharded_tensor
++from torch.distributed._shard import (
++    sharded_tensor,
++    shard_parameter
++)
+ 
+ from copy import deepcopy
+ from torch.distributed._shard.sharding_spec import (
+@@ -77,8 +80,8 @@ def shard_parameter(self):
+             ],
+         )
+ 
+-        sharded_tensor.shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
+-        sharded_tensor.shard_parameter(self.linear2, "weight", colwise_sharding_spec)
++        shard_parameter(self.linear1, "weight", rowwise_sharding_spec)
++        shard_parameter(self.linear2, "weight", colwise_sharding_spec)
+ 
+     def forward(self, inp):
+         return self.linear2(self.gelu(self.linear1(inp)))
\ No newline at end of file
diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increas-distributed-test-timeout.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch
similarity index 100%
rename from Golden_Repo/p/PyTorch/PyTorch-1.11.0_increas-distributed-test-timeout.patch
rename to Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase-distributed-test-timeout.patch
diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
new file mode 100644
index 000000000..3bce7e068
--- /dev/null
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.11.0_increase_test_tolerances_TF32.patch
@@ -0,0 +1,143 @@
+# Author: Caspar van Leeuwen, SURF
+# Fixes failing tests due to use of TensorFloat32
+# Setting NVIDIA_TF32_OVERRIDE=0 makes these tests pass, proving that TensorFloat32 is the issue
+# We increase tolerances for the asserts to make these tests pass
+diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py
+--- pytorch_orig/test/distributed/_shard/sharded_tensor/ops/test_linear.py	2022-04-07 18:31:13.069599000 +0200
++++ pytorch/test/distributed/_shard/sharded_tensor/ops/test_linear.py	2022-04-07 18:32:32.877406000 +0200
+@@ -77,7 +77,7 @@
+         local_output = local_linear(inp)
+ 
+         # Verify
+-        self.assertEqual(local_output, sharded_output)
++        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
+ 
+         # Validate for torch.nn.functional.linear version.
+         local_output = torch.nn.functional.linear(
+@@ -91,7 +91,7 @@
+         # for reshard. We need to squeeze the # of dimensions manually.
+         if inp.dim() == 1:
+             sharded_output = sharded_output.squeeze(reshard_spec.dim)
+-        self.assertEqual(local_output, sharded_output)
++        self.assertEqual(local_output, sharded_output, rtol=0.02, atol=1e-03)
+ 
+         # Compute loss and run backward pass.
+         local_output.sum().backward()
+@@ -114,7 +114,7 @@
+ 
+         # Test backward gradient calculation.
+         self.assertEqual(sharded_linear.bias.grad, local_bias_grad)
+-        self.assertEqual(sharded_weight.grad, local_grad_narrowed)
++        self.assertEqual(sharded_weight.grad, local_grad_narrowed, rtol=0.01, atol=1e-03)
+ 
+         # Test optimizer.
+         previous = local_linear.weight.clone().detach()
+@@ -135,7 +135,7 @@
+         )
+         self.assertEqual(sharded_weight.size(), local_weight_narrowed.size())
+         self.assertNotEqual(previous_sharded_weight, sharded_weight)
+-        self.assertEqual(sharded_weight, local_weight_narrowed)
++        self.assertEqual(sharded_weight, local_weight_narrowed, rtol=0.01, atol=1e-04)
+         self.assertNotEqual(previous_sharded_bias, sharded_linear.bias)
+         self.assertEqual(sharded_linear.bias, local_linear.bias)
+ 
+diff -Nru pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py
+--- pytorch_orig/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py	2022-04-07 18:31:13.091710000 +0200
++++ pytorch/test/distributed/_shard/sharded_tensor/test_megatron_prototype.py	2022-04-07 18:41:03.744644000 +0200
+@@ -113,7 +113,7 @@
+         local_output = local_megatron_lm(inp)
+ 
+         # Verify
+-        self.assertEqual(local_output, sharded_output)
++        self.assertEqual(local_output, sharded_output, rtol=0.01, atol=1e-03)
+ 
+         # Compute loss and run backward pass.
+         local_output.sum().backward()
+@@ -161,9 +161,9 @@
+         )
+ 
+         # Test backward gradient calculation.
+-        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1)
+-        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2)
+-        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1)
++        self.assertEqual(sharded_weight_fc1.grad, local_grad_narrowed_fc1, rtol=0.01, atol=2e-03)
++        self.assertEqual(sharded_weight_fc2.grad, local_grad_narrowed_fc2, rtol=0.01, atol=1e-03)
++        self.assertEqual(bias_grad_fc1, local_bias_grad_fc1, rtol=0.01, atol=2e-02)
+         self.assertEqual(bias_grad_fc2, local_bias_grad_fc2)
+ 
+         # Test optimizer.
+@@ -171,7 +171,7 @@
+         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
+         self.assertEqual(bias_fc1, local_bias_fc1)
+         self.assertEqual(bias_fc2, local_bias_fc2)
+-        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad)
++        self.assertEqual(bias_fc1.grad, local_bias_fc1.grad, rtol=0.01, atol=2e-02)
+         self.assertEqual(bias_fc2.grad, local_bias_fc2.grad)
+         previous_sharded_weight_fc1 = sharded_weight_fc1.clone()
+         previous_sharded_weight_fc2 = sharded_weight_fc2.clone()
+@@ -197,13 +197,13 @@
+         self.assertEqual(sharded_weight_fc2.size(), local_weight_fc2_narrowed.size())
+         self.assertNotEqual(previous_sharded_weight_fc1, sharded_weight_fc1)
+         self.assertNotEqual(previous_sharded_weight_fc2, sharded_weight_fc2)
+-        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed)
+-        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed)
++        self.assertEqual(sharded_weight_fc1, local_weight_fc1_narrowed, rtol=0.01, atol=1e-03)
++        self.assertEqual(sharded_weight_fc2, local_weight_fc2_narrowed, rtol=0.01, atol=1e-03)
+ 
+         # Test bias value after optimizer.
+         local_bias_fc1, local_bias_fc2 = _get_bias(local_megatron_lm)
+         self.assertNotEqual(previous_bias_fc1, bias_fc1)
+-        self.assertEqual(bias_fc1, local_bias_fc1)
++        self.assertEqual(bias_fc1, local_bias_fc1, rtol=0.01, atol=1e-03)
+         self.assertNotEqual(previous_bias_fc2, bias_fc2)
+         self.assertEqual(bias_fc2, local_bias_fc2)
+ 
+diff -Nru pytorch_orig/test/test_stateless.py pytorch/test/test_stateless.py
+--- pytorch_orig/test/test_stateless.py	2022-04-07 18:31:13.029968000 +0200
++++ pytorch/test/test_stateless.py	2022-04-07 18:43:46.723968000 +0200
+@@ -42,7 +42,7 @@
+         # existing params in module. So here we expect the result to be the
+         # same as the input if the weight swapping went well.
+         res = _stateless.functional_call(module, parameters, x)
+-        self.assertEqual(x, res)
++        self.assertEqual(x, res, rtol=1e-04, atol=1e-04)
+         # check that the weight remain unmodified
+         cur_weight = to_check.l1.weight
+         uur_buffer = to_check.buffer
+c PyTorch-1.11.0_increase_test_tolerances_TF32.patch
+rig/test/test_jit_fuser_te.py pytorch/test/test_jit_fuser_te.py
+--- pytorch_orig/test/test_jit_fuser_te.py  2022-04-07 18:31:13.046680000 +0200
++++ pytorch/test/test_jit_fuser_te.py   2022-04-12 18:21:00.355114000 +0200
+@@ -956,7 +956,7 @@
+     def test_lstm_traced(self):
+         for device in self.devices:
+             inputs = get_lstm_inputs(device)
+-            ge = self.checkTrace(LSTMCellF, inputs)
++            ge = self.checkTrace(LSTMCellF, inputs, atol=1e-4, rtol=1e-5)
+             graph = ge.graph_for(*inputs)
+             fusion_groups = self.findFusionGroups(graph)
+             # TODO: chunk
+diff -Nru pytorch_orig/torch/testing/_internal/jit_utils.py pytorch/torch/testing/_internal/jit_utils.py
+--- pytorch_orig/torch/testing/_internal/jit_utils.py   2022-04-07 18:28:54.339477000 +0200
++++ pytorch/torch/testing/_internal/jit_utils.py    2022-04-12 18:19:59.614272000 +0200
+@@ -525,7 +525,7 @@
+     def checkTrace(self, func, reference_tensors, input_tensors=None,
+                    drop=None, allow_unused=False, verbose=False,
+                    inputs_require_grads=True, check_tolerance=1e-5, export_import=True,
+-                   _force_outplace=False):
++                   _force_outplace=False, rtol=None, atol=None):
+
+         # TODO: check gradients for parameters, not just inputs
+         def allSum(vs):
+@@ -618,7 +618,10 @@
+
+         self.assertEqual(outputs, outputs_ge)
+         if inputs_require_grads:
+-            self.assertEqual(grads, grads_ge)
++            if atol is not None and rtol is not None:
++                self.assertEqual(grads, grads_ge, atol=atol, rtol=rtol)
++            else:
++                self.assertEqual(grads, grads_ge)
+             for g2, g2_ge in zip(grads2, grads2_ge):
+                 if g2 is None and g2_ge is None:
+                     continue
\ No newline at end of file
-- 
GitLab