diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch b/Golden_Repo/p/PyTorch/PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch new file mode 100644 index 0000000000000000000000000000000000000000..39e4d9883bdcafff1586aa278a08a41a8f786527 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch @@ -0,0 +1,48 @@ +From c7c7460fd3a49a9d289394b80d9ecf61898edf49 Mon Sep 17 00:00:00 2001 +From: Alexander Grund <alexander.grund@tu-dresden.de> +Date: Wed, 9 Sep 2020 08:47:03 +0200 +Subject: [PATCH] Choose test affinity based on current affinity + +--- + test/test_dataloader.py | 18 ++++++++++++------ + 1 file changed, 12 insertions(+), 6 deletions(-) + +diff --git a/test/test_dataloader.py b/test/test_dataloader.py +index ca0c9e6cb511f..745942bcf01f2 100644 +--- a/test/test_dataloader.py ++++ b/test/test_dataloader.py +@@ -2128,22 +2128,28 @@ def __iter__(self): + after = os.sched_getaffinity(0) + return iter(after) + +- +-def worker_set_affinity(_): +- os.sched_setaffinity(0, [2]) +- +- + @unittest.skipIf( + not hasattr(os, 'sched_setaffinity'), + "os.sched_setaffinity is not available") + class TestSetAffinity(TestCase): + def test_set_affinity_in_worker_init(self): ++ # Query the current affinity mask to avoid setting a disallowed one ++ old_affinity = os.sched_getaffinity(0) ++ if not old_affinity: ++ self.skipTest("No affinity information") ++ # Choose any ++ expected_affinity = list(old_affinity)[-1] ++ ++ def worker_set_affinity(_): ++ os.sched_setaffinity(0, [expected_affinity]) ++ ++ + dataset = SetAffinityDataset() + + dataloader = torch.utils.data.DataLoader( + dataset, num_workers=2, worker_init_fn=worker_set_affinity) + for sample in dataloader: +- self.assertEqual(sample, [2]) ++ self.assertEqual(sample, [expected_affinity]) + + + diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0-gcccoremkl-2020-Python-3.8.5.eb b/Golden_Repo/p/PyTorch/PyTorch-1.7.0-gcccoremkl-2020-Python-3.8.5.eb 
new file mode 100644 index 0000000000000000000000000000000000000000..1422b8f3f7a65ccf1e301c22b9494ed34fcd0ca3 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0-gcccoremkl-2020-Python-3.8.5.eb @@ -0,0 +1,291 @@ +name = 'PyTorch' +version = '1.7.0' +versionsuffix = '-Python-%(pyver)s' + +homepage = 'https://pytorch.org/' +description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. +PyTorch is a deep learning framework that puts Python first.""" + +toolchain = {'name': 'gcccoremkl', 'version': '9.3.0-2020.2.254'} + +site_contacts = 'Alexandre Strube <a.strube@fz-juelich.de>' + +osdependencies = [OS_PKG_IBVERBS_DEV] +local_cudaver = '11.0' +parallel = 128 # apparently this is solemnly ignored + +builddependencies = [ + ('CMake', '3.18.0'), + ('hypothesis', '4.44.2', versionsuffix), +] + +dependencies = [ + ('Ninja', '1.10.0'), + ('Python', '3.8.5'), + ('protobuf', '3.13.0'), + ('protobuf-python', '3.13.0', versionsuffix), + ('pybind11', '2.5.0', versionsuffix), + ('SciPy-Stack', '2020', versionsuffix, ('gcccoremkl', '9.3.0-2020.2.254')), + ('MPFR', '4.1.0'), + ('GMP', '6.2.0'), + ('numactl', '2.0.13', '', True), + ('FFmpeg', '4.3.1'), + ('cuDNN', '8.0.2.39', '-CUDA-%s' % local_cudaver, True), + ('magma', '2.5.4'), + ('NCCL', '2.8.3-1', '-CUDA-%s' % local_cudaver), + ('LLVM', '10.0.1'), + ('future', '0.18.2', versionsuffix), + ('libvpx', '1.9.0'), +] + + +# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) +cuda_compute_capabilities = ['7.0', '7.5', '8.0'] + + +# PyTorch pulls in a bunch of submodules that are not releases. +# We download the submodule revisions from their repos. 
+# The list is generated by easybuild-framework/easybuild/scripts/createSubmoduleDeps.sh +local_extract_cmd_pattern = 'tar -C pytorch-%%(version)s/third_party/%s --strip-components=1 -xf %%s' + +local_cudaver = '11.0' + +source_urls = ['https://github.com/pytorch/pytorch/archive'] + +sources = [ + 'v%(version)s.tar.gz', # PyTorch + { + 'source_urls': ['https://github.com/Maratyszcza/FP16/archive'], + 'download_filename': '4dfe081cf6bcd15db339cf2680b9281b8451eeb3.tar.gz', + 'filename': 'FP16-20200514.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'FP16', + }, + { + 'source_urls': ['https://github.com/Maratyszcza/FXdiv/archive'], + 'download_filename': 'b408327ac2a15ec3e43352421954f5b1967701d1.tar.gz', + 'filename': 'FXdiv-20200417.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'FXdiv', + }, + { + 'source_urls': ['https://github.com/Maratyszcza/NNPACK/archive'], + 'download_filename': '24b55303f5cf65d75844714513a0d1b1409809bd.tar.gz', + 'filename': 'NNPACK-20191007.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'NNPACK', + }, + { + 'source_urls': ['https://github.com/pytorch/QNNPACK/archive'], + 'download_filename': '7d2a4e9931a82adc3814275b6219a03e24e36b4c.tar.gz', + 'filename': 'QNNPACK-20190828.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'QNNPACK', + }, + { + 'source_urls': ['https://github.com/google/XNNPACK/archive'], + 'download_filename': '1b354636b5942826547055252f3b359b54acff95.tar.gz', + 'filename': 'XNNPACK-20200323.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'XNNPACK', + }, + { + 'source_urls': ['https://github.com/google/benchmark/archive'], + 'download_filename': '505be96ab23056580a3a2315abba048f4428b04e.tar.gz', + 'filename': 'benchmark-20180606.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'benchmark', + }, + { + 'source_urls': ['https://github.com/pytorch/cpuinfo/archive'], + 'download_filename': '63b254577ed77a8004a9be6ac707f3dccc4e1fd9.tar.gz', + 'filename': 'cpuinfo-20200611.tar.gz', + 
'extract_cmd': local_extract_cmd_pattern % 'cpuinfo', + }, + { + 'source_urls': ['https://github.com/NVlabs/cub/archive'], + 'download_filename': 'd106ddb991a56c3df1b6d51b2409e36ba8181ce4.tar.gz', + 'filename': 'cub-20200512.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'cub', + }, + { + 'source_urls': ['https://github.com/eigenteam/eigen-git-mirror/archive'], + 'download_filename': 'd41dc4dd74acce21fb210e7625d5d135751fa9e5.tar.gz', + 'filename': 'eigen-20190125.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'eigen', + }, + { + 'source_urls': ['https://github.com/pytorch/fbgemm/archive'], + 'download_filename': '1d710393d5b7588f5de3b83f51c22bbddf095229.tar.gz', + 'filename': 'fbgemm-20200914.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'fbgemm', + }, + { + 'source_urls': ['https://github.com/asmjit/asmjit/archive'], + 'download_filename': '9057aa30b620f0662ff51e2230c126a345063064.tar.gz', + 'filename': 'asmjit-20200429.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/asmjit', + }, + { + 'source_urls': ['https://github.com/pytorch/cpuinfo/archive'], + 'download_filename': 'd5e37adf1406cf899d7d9ec1d317c47506ccb970.tar.gz', + 'filename': 'cpuinfo-20190201.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/cpuinfo', + }, + { + 'source_urls': ['https://github.com/google/googletest/archive'], + 'download_filename': '0fc5466dbb9e623029b1ada539717d10bd45e99e.tar.gz', + 'filename': 'googletest-20180925.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/googletest', + }, + { + 'source_urls': ['https://github.com/fmtlib/fmt/archive'], + 'download_filename': 'cd4af11efc9c622896a3e4cb599fa28668ca3d05.tar.gz', + 'filename': 'fmt-20200806.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'fmt', + }, + { + 'source_urls': ['https://github.com/houseroad/foxi/archive'], + 'download_filename': '4aba696ec8f31794fd42880346dc586486205e0a.tar.gz', + 'filename': 'foxi-20200922.tar.gz', + 
'extract_cmd': local_extract_cmd_pattern % 'foxi', + }, + { + 'source_urls': ['https://github.com/google/gemmlowp/archive'], + 'download_filename': '3fb5c176c17c765a3492cd2f0321b0dab712f350.tar.gz', + 'filename': 'gemmlowp-20181126.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'gemmlowp/gemmlowp', + }, + { + 'source_urls': ['https://github.com/facebookincubator/gloo/archive'], + 'download_filename': '3dc0328fe6a9d47bd47c0c6ca145a0d8a21845c6.tar.gz', + 'filename': 'gloo-20200918.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'gloo', + }, + { + 'source_urls': ['https://github.com/google/googletest/archive'], + 'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz', + 'filename': 'googletest-20180831.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'googletest', + }, + { + 'source_urls': ['https://github.com/intel/ideep/archive'], + 'download_filename': 'ba885200dbbc1f144c7b58eba487378eb324f281.tar.gz', + 'filename': 'ideep-20200915.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'ideep', + }, + { + 'source_urls': ['https://github.com/intel/mkl-dnn/archive'], + 'download_filename': '5ef631a030a6f73131c77892041042805a06064f.tar.gz', + 'filename': 'mkl-dnn-20200909.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'ideep/mkl-dnn', + }, + { + 'source_urls': ['https://github.com/onnx/onnx/archive'], + 'download_filename': 'a82c6a7010e2e332d8f74ad5b0c726fd47c85376.tar.gz', + 'filename': 'onnx-20200531.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'onnx', + }, + { + 'source_urls': ['https://github.com/google/benchmark/archive'], + 'download_filename': 'e776aa0275e293707b6a0901e0e8d8a8a3679508.tar.gz', + 'filename': 'benchmark-20180525.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'onnx/third_party/benchmark', + }, + { + 'source_urls': ['https://github.com/onnx/onnx-tensorrt/archive'], + 'download_filename': 'c153211418a7c57ce071d9ce2a41f8d1c85a878f.tar.gz', + 'filename': 'onnx-tensorrt-20190916.tar.gz', + 
'extract_cmd': local_extract_cmd_pattern % 'onnx-tensorrt', + }, + { + 'source_urls': ['https://github.com/Maratyszcza/psimd/archive'], + 'download_filename': '072586a71b55b7f8c584153d223e95687148a900.tar.gz', + 'filename': 'psimd-20200517.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'psimd', + }, + { + 'source_urls': ['https://github.com/Maratyszcza/pthreadpool/archive'], + 'download_filename': '029c88620802e1361ccf41d1970bd5b07fd6b7bb.tar.gz', + 'filename': 'pthreadpool-20200615.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'pthreadpool', + }, + { + 'source_urls': ['https://github.com/Maratyszcza/PeachPy/archive'], + 'download_filename': '07d8fde8ac45d7705129475c0f94ed8925b93473.tar.gz', + 'filename': 'PeachPy-20180218.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'python-peachpy', + }, + { + 'source_urls': ['https://github.com/shibatch/sleef/archive'], + 'download_filename': '7f523de651585fe25cade462efccca647dcc8d02.tar.gz', + 'filename': 'sleef-20190730.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'sleef', + }, + { + 'source_urls': ['https://github.com/01org/tbb/archive'], + 'download_filename': 'a51a90bc609bb73db8ea13841b5cf7aa4344d4a9.tar.gz', + 'filename': 'tbb-20181009.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'tbb', + }, + { + 'source_urls': ['https://github.com/pytorch/tensorpipe/archive'], + 'download_filename': '95ff9319161fcdb3c674d2bb63fac3e94095b343.tar.gz', + 'filename': 'tensorpipe-20200928.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'tensorpipe', + }, + { + 'source_urls': ['https://github.com/google/googletest/archive'], + 'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz', + 'filename': 'googletest-20180831.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/googletest', + }, + { + 'source_urls': ['https://github.com/google/libnop/archive'], + 'download_filename': 'aa95422ea8c409e3f078d2ee7708a5f59a8b9fa2.tar.gz', + 'filename': 
'libnop-20200723.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libnop', + }, + { + 'source_urls': ['https://github.com/libuv/libuv/archive'], + 'download_filename': '02a9e1be252b623ee032a3137c0b0c94afbe6809.tar.gz', + 'filename': 'libuv-20200419.tar.gz', + 'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libuv', + }, +] +patches = [ + 'PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch', + 'PyTorch-1.7.0_fix_test_DistributedDataParallel.patch', + 'PyTorch-1.7.0_fix_typing_python38.patch', + 'PyTorch-1.7.0_fix_remove_backports.patch', +] + +excluded_tests = { + '': [ + # Fails on HDFML. Probably needs a GPU and a network - needs NCCL + 'distributed/test_distributed_fork', + # Fails on HDFML. + 'distributed/test_distributed_spawn', + # Fails on JUWELS (cluster) with SIGXCPU and on JUWELSBOOSTER + 'test_foreach', + 'test_xnnpack_integration', + # Fails on JUSUF + 'distributed/rpc/test_process_group_agent', + 'distributed/rpc/test_tensorpipe_agent', + 'test_autograd', + 'test_jit', + # Disabling all distributed tests because I don't have the whole year. Each test takes 2 hours + 'distributed/nn/jit/test_instantiator', + 'distributed/rpc/test_faulty_agent', + 'distributed/rpc/test_process_group_agent', + 'distributed/rpc/test_tensorpipe_agent', + # This test fails everywhere: https://github.com/pytorch/pytorch/issues/41242 + # 'test_cpp_extensions_jit', + # Throws illegal memory access due to float16: https://github.com/pytorch/pytorch/issues/41340 + # 'test_torch', + # Potentially problematic save/load issue with test_lstm on only some machines. Tell users to verify save&load! 
+ # https://github.com/pytorch/pytorch/issues/43209 + # 'test_quantization', + ], +} + +runtest = 'cd test && %(python)s run_test.py --verbose %(excluded_tests)s' + +sanity_check_commands = ["python -c 'import caffe2.python'"] +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'devel' diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_remove_backports.patch b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_remove_backports.patch new file mode 100644 index 0000000000000000000000000000000000000000..8449d476f473e8b86edb86591f8512433a8f5482 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_remove_backports.patch @@ -0,0 +1,11 @@ +--- setup.py.orig 2020-11-12 14:13:54.727306249 +0100 ++++ setup.py 2020-11-12 14:14:42.105485851 +0100 +@@ -337,7 +337,7 @@ + ################################################################################ + + # the list of runtime dependencies required by this built package +-install_requires = ['future', 'typing_extensions', 'dataclasses'] ++install_requires = ['future'] + + missing_pydep = ''' + Missing build dependency: Unable to `import {importname}`. diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_test_DistributedDataParallel.patch b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_test_DistributedDataParallel.patch new file mode 100644 index 0000000000000000000000000000000000000000..18c874c475b985602f88a4113851df806107edce --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_test_DistributedDataParallel.patch @@ -0,0 +1,40 @@ +From 4ee880e1645da7c581a04de6cfe30d911f659f57 Mon Sep 17 00:00:00 2001 +From: Alexander Grund <alexander.grund@tu-dresden.de> +Date: Thu, 15 Oct 2020 14:30:03 +0200 +Subject: [PATCH] Distribute GPUs in round robin mode for distributed_test + +The ProcessGroupNCCL::barrier implementation assumes that when +1 GPU/rank is used the GPU-Index equals the rank. 
Due to NCCL +communicator reuse this then leads to rank 0 using the (kinda) +temporary communicator while the other processes might use other GPUs +leading to them trying to create a new communicator and waiting for +rank 0 until that creates a new (potentially unrelated) one. + +See #46248 for details +--- + torch/testing/_internal/distributed/distributed_test.py | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py +index ccaccbda529aa..4d7c23b613d7c 100644 +--- a/torch/testing/_internal/distributed/distributed_test.py ++++ b/torch/testing/_internal/distributed/distributed_test.py +@@ -362,16 +362,14 @@ def _init_multigpu_helper(self): + """ + nGPUs = torch.cuda.device_count() + world_size = dist.get_world_size() +- visible_devices = range(nGPUs) + + if BACKEND == "nccl": + apply_hack_for_nccl() + + nGPUs_per_process = nGPUs // world_size + rank_to_GPU = { +- i: list( +- visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process] +- ) ++ # Each rank has to get the GPU with the index equal to its rank ++ i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)] + for i in range(world_size) + } + return rank_to_GPU \ No newline at end of file diff --git a/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_typing_python38.patch b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_typing_python38.patch new file mode 100644 index 0000000000000000000000000000000000000000..da556eaef46876c75aae77b2f50a0dc5d55e592e --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-1.7.0_fix_typing_python38.patch @@ -0,0 +1,19 @@ +--- test/jit/test_recursive_script.py.orig 2020-11-11 18:40:22.959601697 +0100 ++++ test/jit/test_recursive_script.py 2020-11-11 18:40:44.718092798 +0100 +@@ -2,7 +2,6 @@ + import os + import sys + import typing +-import typing_extensions + from typing import List, Dict, Optional, Tuple + + import torch +@@ -174,7 +173,7 @@ + 
self.checkModule(M1(), (torch.randn(2, 2),)) + + class M2(torch.nn.Module): +- x : typing_extensions.Final[int] ++ x : typing.Final[int] + + def __init__(self): + super().__init__() diff --git a/Golden_Repo/p/PyTorch/PyTorch-check-cpp-extension.py b/Golden_Repo/p/PyTorch/PyTorch-check-cpp-extension.py new file mode 100755 index 0000000000000000000000000000000000000000..0a8f6d3e6c59ed07b073470e1a7db54548099e37 --- /dev/null +++ b/Golden_Repo/p/PyTorch/PyTorch-check-cpp-extension.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +# Verify that PyTorch can JIT compile C++ extensions +# This requires at least Ninja and a working C++ compiler, preferably GCC +# +# Heavily based on the PyTorch tutorial for C++ extensions +# Author: Alexander Grund (TU Dresden) + +from torch.utils.cpp_extension import load_inline + +cpp_source = "torch::Tensor test_func(torch::Tensor x) { return x; }" + +module = load_inline(name='inline_extension', + cpp_sources=cpp_source, + functions=['test_func']) +assert module