test

0bc8a6fc · Alexandre Strube · e8d4f180 · 0bc8a6fc · 0bc8a6fc · 0bc8a6fc
Commit 0bc8a6fc authored 4 years ago by Alexandre Strube
--- a/Golden_Repo/p/PyTorch/PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch
+From c7c7460fd3a49a9d289394b80d9ecf61898edf49 Mon Sep 17 00:00:00 2001
+From: Alexander Grund <alexander.grund@tu-dresden.de>
+Date: Wed, 9 Sep 2020 08:47:03 +0200
+Subject: [PATCH] Choose test affinity based on current affinity
+---
+ test/test_dataloader.py | 18 ++++++++++++------
+ 1 file changed, 12 insertions(+), 6 deletions(-)
+diff --git a/test/test_dataloader.py b/test/test_dataloader.py
+index ca0c9e6cb511f..745942bcf01f2 100644
+--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
+@@ -2128,22 +2128,28 @@ def __iter__(self):
+         after = os.sched_getaffinity(0)
+         return iter(after)
+-
+-def worker_set_affinity(_):
+-    os.sched_setaffinity(0, [2])
+-
+-
+ @unittest.skipIf(
+     not hasattr(os, 'sched_setaffinity'),
+     "os.sched_setaffinity is not available")
+ class TestSetAffinity(TestCase):
+     def test_set_affinity_in_worker_init(self):
+        # Query the current affinity mask to avoid setting a disallowed one
+        old_affinity = os.sched_getaffinity(0)
+        if not old_affinity:
+            self.skipTest("No affinity information")
+        # Choose any
+        expected_affinity = list(old_affinity)[-1]
+
+        def worker_set_affinity(_):
+            os.sched_setaffinity(0, [expected_affinity])
+
+
+         dataset = SetAffinityDataset()
+         dataloader = torch.utils.data.DataLoader(
+             dataset, num_workers=2, worker_init_fn=worker_set_affinity)
+         for sample in dataloader:
+-            self.assertEqual(sample, [2])
+            self.assertEqual(sample, [expected_affinity])
--- a/Golden_Repo/p/PyTorch/PyTorch-1.7.0-gcccoremkl-2020-Python-3.8.5.eb
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0-gcccoremkl-2020-Python-3.8.5.eb
+name = 'PyTorch'
+version = '1.7.0'
+versionsuffix = '-Python-%(pyver)s'
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+toolchain = {'name': 'gcccoremkl', 'version': '9.3.0-2020.2.254'}
+site_contacts = 'Alexandre Strube <a.strube@fz-juelich.de>'
+osdependencies = [OS_PKG_IBVERBS_DEV]
+local_cudaver = '11.0'
+parallel = 128  # apparently this is solemnly ignored
+builddependencies = [
+    ('CMake', '3.18.0'),
+    ('hypothesis', '4.44.2', versionsuffix),
+]
+dependencies = [
+    ('Ninja', '1.10.0'),
+    ('Python', '3.8.5'),
+    ('protobuf', '3.13.0'),
+    ('protobuf-python', '3.13.0', versionsuffix),
+    ('pybind11', '2.5.0', versionsuffix),
+    ('SciPy-Stack', '2020', versionsuffix, ('gcccoremkl', '9.3.0-2020.2.254')),
+    ('MPFR', '4.1.0'),
+    ('GMP', '6.2.0'),
+    ('numactl', '2.0.13', '', True),
+    ('FFmpeg', '4.3.1'),
+    ('cuDNN', '8.0.2.39', '-CUDA-%s' % local_cudaver, True),
+    ('magma', '2.5.4'),
+    ('NCCL', '2.8.3-1', '-CUDA-%s' % local_cudaver),
+    ('LLVM', '10.0.1'),
+    ('future', '0.18.2', versionsuffix),
+    ('libvpx', '1.9.0'),
+]
+# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
+cuda_compute_capabilities = ['7.0', '7.5', '8.0']
+# PyTorch pulls in a bunch of submodules that are not releases.
+# We download the submodule revisions from their repos.
+# The list is generated by easybuild-framework/easybuild/scripts/createSubmoduleDeps.sh
+local_extract_cmd_pattern = 'tar -C pytorch-%%(version)s/third_party/%s --strip-components=1 -xf %%s'
+local_cudaver = '11.0'
+source_urls = ['https://github.com/pytorch/pytorch/archive']
+sources = [
+    'v%(version)s.tar.gz',  # PyTorch
+    {
+        'source_urls': ['https://github.com/Maratyszcza/FP16/archive'],
+        'download_filename': '4dfe081cf6bcd15db339cf2680b9281b8451eeb3.tar.gz',
+        'filename': 'FP16-20200514.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'FP16',
+    },
+    {
+        'source_urls': ['https://github.com/Maratyszcza/FXdiv/archive'],
+        'download_filename': 'b408327ac2a15ec3e43352421954f5b1967701d1.tar.gz',
+        'filename': 'FXdiv-20200417.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'FXdiv',
+    },
+    {
+        'source_urls': ['https://github.com/Maratyszcza/NNPACK/archive'],
+        'download_filename': '24b55303f5cf65d75844714513a0d1b1409809bd.tar.gz',
+        'filename': 'NNPACK-20191007.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'NNPACK',
+    },
+    {
+        'source_urls': ['https://github.com/pytorch/QNNPACK/archive'],
+        'download_filename': '7d2a4e9931a82adc3814275b6219a03e24e36b4c.tar.gz',
+        'filename': 'QNNPACK-20190828.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'QNNPACK',
+    },
+    {
+        'source_urls': ['https://github.com/google/XNNPACK/archive'],
+        'download_filename': '1b354636b5942826547055252f3b359b54acff95.tar.gz',
+        'filename': 'XNNPACK-20200323.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'XNNPACK',
+    },
+    {
+        'source_urls': ['https://github.com/google/benchmark/archive'],
+        'download_filename': '505be96ab23056580a3a2315abba048f4428b04e.tar.gz',
+        'filename': 'benchmark-20180606.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'benchmark',
+    },
+    {
+        'source_urls': ['https://github.com/pytorch/cpuinfo/archive'],
+        'download_filename': '63b254577ed77a8004a9be6ac707f3dccc4e1fd9.tar.gz',
+        'filename': 'cpuinfo-20200611.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'cpuinfo',
+    },
+    {
+        'source_urls': ['https://github.com/NVlabs/cub/archive'],
+        'download_filename': 'd106ddb991a56c3df1b6d51b2409e36ba8181ce4.tar.gz',
+        'filename': 'cub-20200512.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'cub',
+    },
+    {
+        'source_urls': ['https://github.com/eigenteam/eigen-git-mirror/archive'],
+        'download_filename': 'd41dc4dd74acce21fb210e7625d5d135751fa9e5.tar.gz',
+        'filename': 'eigen-20190125.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'eigen',
+    },
+    {
+        'source_urls': ['https://github.com/pytorch/fbgemm/archive'],
+        'download_filename': '1d710393d5b7588f5de3b83f51c22bbddf095229.tar.gz',
+        'filename': 'fbgemm-20200914.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'fbgemm',
+    },
+    {
+        'source_urls': ['https://github.com/asmjit/asmjit/archive'],
+        'download_filename': '9057aa30b620f0662ff51e2230c126a345063064.tar.gz',
+        'filename': 'asmjit-20200429.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/asmjit',
+    },
+    {
+        'source_urls': ['https://github.com/pytorch/cpuinfo/archive'],
+        'download_filename': 'd5e37adf1406cf899d7d9ec1d317c47506ccb970.tar.gz',
+        'filename': 'cpuinfo-20190201.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/cpuinfo',
+    },
+    {
+        'source_urls': ['https://github.com/google/googletest/archive'],
+        'download_filename': '0fc5466dbb9e623029b1ada539717d10bd45e99e.tar.gz',
+        'filename': 'googletest-20180925.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/googletest',
+    },
+    {
+        'source_urls': ['https://github.com/fmtlib/fmt/archive'],
+        'download_filename': 'cd4af11efc9c622896a3e4cb599fa28668ca3d05.tar.gz',
+        'filename': 'fmt-20200806.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'fmt',
+    },
+    {
+        'source_urls': ['https://github.com/houseroad/foxi/archive'],
+        'download_filename': '4aba696ec8f31794fd42880346dc586486205e0a.tar.gz',
+        'filename': 'foxi-20200922.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'foxi',
+    },
+    {
+        'source_urls': ['https://github.com/google/gemmlowp/archive'],
+        'download_filename': '3fb5c176c17c765a3492cd2f0321b0dab712f350.tar.gz',
+        'filename': 'gemmlowp-20181126.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'gemmlowp/gemmlowp',
+    },
+    {
+        'source_urls': ['https://github.com/facebookincubator/gloo/archive'],
+        'download_filename': '3dc0328fe6a9d47bd47c0c6ca145a0d8a21845c6.tar.gz',
+        'filename': 'gloo-20200918.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'gloo',
+    },
+    {
+        'source_urls': ['https://github.com/google/googletest/archive'],
+        'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz',
+        'filename': 'googletest-20180831.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'googletest',
+    },
+    {
+        'source_urls': ['https://github.com/intel/ideep/archive'],
+        'download_filename': 'ba885200dbbc1f144c7b58eba487378eb324f281.tar.gz',
+        'filename': 'ideep-20200915.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'ideep',
+    },
+    {
+        'source_urls': ['https://github.com/intel/mkl-dnn/archive'],
+        'download_filename': '5ef631a030a6f73131c77892041042805a06064f.tar.gz',
+        'filename': 'mkl-dnn-20200909.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'ideep/mkl-dnn',
+    },
+    {
+        'source_urls': ['https://github.com/onnx/onnx/archive'],
+        'download_filename': 'a82c6a7010e2e332d8f74ad5b0c726fd47c85376.tar.gz',
+        'filename': 'onnx-20200531.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'onnx',
+    },
+    {
+        'source_urls': ['https://github.com/google/benchmark/archive'],
+        'download_filename': 'e776aa0275e293707b6a0901e0e8d8a8a3679508.tar.gz',
+        'filename': 'benchmark-20180525.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'onnx/third_party/benchmark',
+    },
+    {
+        'source_urls': ['https://github.com/onnx/onnx-tensorrt/archive'],
+        'download_filename': 'c153211418a7c57ce071d9ce2a41f8d1c85a878f.tar.gz',
+        'filename': 'onnx-tensorrt-20190916.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'onnx-tensorrt',
+    },
+    {
+        'source_urls': ['https://github.com/Maratyszcza/psimd/archive'],
+        'download_filename': '072586a71b55b7f8c584153d223e95687148a900.tar.gz',
+        'filename': 'psimd-20200517.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'psimd',
+    },
+    {
+        'source_urls': ['https://github.com/Maratyszcza/pthreadpool/archive'],
+        'download_filename': '029c88620802e1361ccf41d1970bd5b07fd6b7bb.tar.gz',
+        'filename': 'pthreadpool-20200615.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'pthreadpool',
+    },
+    {
+        'source_urls': ['https://github.com/Maratyszcza/PeachPy/archive'],
+        'download_filename': '07d8fde8ac45d7705129475c0f94ed8925b93473.tar.gz',
+        'filename': 'PeachPy-20180218.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'python-peachpy',
+    },
+    {
+        'source_urls': ['https://github.com/shibatch/sleef/archive'],
+        'download_filename': '7f523de651585fe25cade462efccca647dcc8d02.tar.gz',
+        'filename': 'sleef-20190730.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'sleef',
+    },
+    {
+        'source_urls': ['https://github.com/01org/tbb/archive'],
+        'download_filename': 'a51a90bc609bb73db8ea13841b5cf7aa4344d4a9.tar.gz',
+        'filename': 'tbb-20181009.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'tbb',
+    },
+    {
+        'source_urls': ['https://github.com/pytorch/tensorpipe/archive'],
+        'download_filename': '95ff9319161fcdb3c674d2bb63fac3e94095b343.tar.gz',
+        'filename': 'tensorpipe-20200928.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'tensorpipe',
+    },
+    {
+        'source_urls': ['https://github.com/google/googletest/archive'],
+        'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz',
+        'filename': 'googletest-20180831.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/googletest',
+    },
+    {
+        'source_urls': ['https://github.com/google/libnop/archive'],
+        'download_filename': 'aa95422ea8c409e3f078d2ee7708a5f59a8b9fa2.tar.gz',
+        'filename': 'libnop-20200723.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libnop',
+    },
+    {
+        'source_urls': ['https://github.com/libuv/libuv/archive'],
+        'download_filename': '02a9e1be252b623ee032a3137c0b0c94afbe6809.tar.gz',
+        'filename': 'libuv-20200419.tar.gz',
+        'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libuv',
+    },
+]
+patches = [
+    'PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch',
+    'PyTorch-1.7.0_fix_test_DistributedDataParallel.patch',
+    'PyTorch-1.7.0_fix_typing_python38.patch',
+    'PyTorch-1.7.0_fix_remove_backports.patch',
+]
+excluded_tests = {
+    '': [
+        # Fails on HDFML. Probably needs a GPU and a network - needs NCCL
+        'distributed/test_distributed_fork',
+        # Fails on HDFML.
+        'distributed/test_distributed_spawn',
+        # Fails on JUWELS (cluster) with SIGXCPU and on JUWELSBOOSTER
+        'test_foreach',
+        'test_xnnpack_integration',
+        # Fails on JUSUF
+        'distributed/rpc/test_process_group_agent',
+        'distributed/rpc/test_tensorpipe_agent',
+        'test_autograd',
+        'test_jit',
+        # Disabling all distributed tests because they take far too long to run: each test takes about 2 hours
+        'distributed/nn/jit/test_instantiator',
+        'distributed/rpc/test_faulty_agent',
+        'distributed/rpc/test_process_group_agent',
+        'distributed/rpc/test_tensorpipe_agent',
+        # This test fails everywhere: https://github.com/pytorch/pytorch/issues/41242
+        # 'test_cpp_extensions_jit',
+        # Throws illegal memory access due to float16: https://github.com/pytorch/pytorch/issues/41340
+        # 'test_torch',
+        # Potentially problematic save/load issue with test_lstm on only some machines. Tell users to verify save&load!
+        # https://github.com/pytorch/pytorch/issues/43209
+        # 'test_quantization',
+    ],
+}
+runtest = 'cd test && %(python)s run_test.py --verbose %(excluded_tests)s'
+sanity_check_commands = ["python -c 'import caffe2.python'"]
+tests = ['PyTorch-check-cpp-extension.py']
+moduleclass = 'devel'
--- a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_remove_backports.patch
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_remove_backports.patch
+--- setup.py.orig	2020-11-12 14:13:54.727306249 +0100
+++ setup.py	2020-11-12 14:14:42.105485851 +0100
+@@ -337,7 +337,7 @@
+ ################################################################################
+ # the list of runtime dependencies required by this built package
+-install_requires = ['future', 'typing_extensions', 'dataclasses']
+install_requires = ['future']
+ missing_pydep = '''
+ Missing build dependency: Unable to `import {importname}`.
--- a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_test_DistributedDataParallel.patch
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_test_DistributedDataParallel.patch
+From 4ee880e1645da7c581a04de6cfe30d911f659f57 Mon Sep 17 00:00:00 2001
+From: Alexander Grund <alexander.grund@tu-dresden.de>
+Date: Thu, 15 Oct 2020 14:30:03 +0200
+Subject: [PATCH] Distribute GPUs in round robin mode for distributed_test
+The ProcessGroupNCCL::barrier implementation assumes that when
+1 GPU/rank is used the GPU-Index equals the rank. Due to NCCL
+communicator reuse this then leads to rank 0 using the (kinda)
+temporary communicator while the other processes might use other GPUs
+leading to them trying to create a new communicator and waiting for
+rank 0 until that creates a new (potentially unrelated) one.
+See #46248 for details
+---
+ torch/testing/_internal/distributed/distributed_test.py | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
+index ccaccbda529aa..4d7c23b613d7c 100644
+--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
+@@ -362,16 +362,14 @@ def _init_multigpu_helper(self):
+             """
+             nGPUs = torch.cuda.device_count()
+             world_size = dist.get_world_size()
+-            visible_devices = range(nGPUs)
+             if BACKEND == "nccl":
+                 apply_hack_for_nccl()
+             nGPUs_per_process = nGPUs // world_size
+             rank_to_GPU = {
+-                i: list(
+-                    visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process]
+-                )
+                # Each rank has to get the GPU with the index equal to its rank
+                i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)]
+                 for i in range(world_size)
+             }
+             return rank_to_GPU
\ No newline at end of file
--- a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_typing_python38.patch
+++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_typing_python38.patch
+--- test/jit/test_recursive_script.py.orig	2020-11-11 18:40:22.959601697 +0100
+++ test/jit/test_recursive_script.py	2020-11-11 18:40:44.718092798 +0100
+@@ -2,7 +2,6 @@
+ import os
+ import sys
+ import typing
+-import typing_extensions
+ from typing import List, Dict, Optional, Tuple
+ import torch
+@@ -174,7 +173,7 @@
+         self.checkModule(M1(), (torch.randn(2, 2),))
+         class M2(torch.nn.Module):
+-            x : typing_extensions.Final[int]
+            x : typing.Final[int]
+             def __init__(self):
+                 super().__init__()
--- a/Golden_Repo/p/PyTorch/PyTorch-check-cpp-extension.py
+++ b/Golden_Repo/p/PyTorch/PyTorch-check-cpp-extension.py
+#!/usr/bin/env python
+# Verify that PyTorch can JIT compile C++ extensions
+# This requires at least Ninja and a working C++ compiler, preferably GCC
+#
+# Heavily based on the PyTorch tutorial for C++ extensions
+# Author: Alexander Grund (TU Dresden)
+from torch.utils.cpp_extension import load_inline
+cpp_source = "torch::Tensor test_func(torch::Tensor x) { return x; }"
+module = load_inline(name='inline_extension',
+                     cpp_sources=cpp_source,
+                     functions=['test_func'])
+assert module