Skip to content
Snippets Groups Projects
Commit 0bc8a6fc authored by Alexandre Strube's avatar Alexandre Strube
Browse files

test

parent e8d4f180
Branches
No related tags found
No related merge requests found
From c7c7460fd3a49a9d289394b80d9ecf61898edf49 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Wed, 9 Sep 2020 08:47:03 +0200
Subject: [PATCH] Choose test affinity based on current affinity
---
test/test_dataloader.py | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index ca0c9e6cb511f..745942bcf01f2 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -2128,22 +2128,28 @@ def __iter__(self):
after = os.sched_getaffinity(0)
return iter(after)
-
-def worker_set_affinity(_):
- os.sched_setaffinity(0, [2])
-
-
@unittest.skipIf(
not hasattr(os, 'sched_setaffinity'),
"os.sched_setaffinity is not available")
class TestSetAffinity(TestCase):
def test_set_affinity_in_worker_init(self):
+ # Query the current affinity mask to avoid setting a disallowed one
+ old_affinity = os.sched_getaffinity(0)
+ if not old_affinity:
+ self.skipTest("No affinity information")
+ # Choose any
+ expected_affinity = list(old_affinity)[-1]
+
+ def worker_set_affinity(_):
+ os.sched_setaffinity(0, [expected_affinity])
+
+
dataset = SetAffinityDataset()
dataloader = torch.utils.data.DataLoader(
dataset, num_workers=2, worker_init_fn=worker_set_affinity)
for sample in dataloader:
- self.assertEqual(sample, [2])
+ self.assertEqual(sample, [expected_affinity])
# Build description for PyTorch 1.7.0, written as Python-syntax key/value parameters.
name = 'PyTorch'
version = '1.7.0'
# '%(pyver)s' is a template placeholder (presumably expanded by the build framework - confirm)
versionsuffix = '-Python-%(pyver)s'
homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""
toolchain = {'name': 'gcccoremkl', 'version': '9.3.0-2020.2.254'}
site_contacts = 'Alexandre Strube <a.strube@fz-juelich.de>'
osdependencies = [OS_PKG_IBVERBS_DEV]
# CUDA version used to build the '-CUDA-<version>' suffix of the CUDA-specific dependencies
local_cudaver = '11.0'
parallel = 128 # apparently this is solemnly ignored
# Packages needed only while building
builddependencies = [
('CMake', '3.18.0'),
('hypothesis', '4.44.2', versionsuffix),
]
# Remaining dependencies. Entries are (name, version[, versionsuffix[, toolchain]]) tuples;
# Python-bound packages reuse this file's versionsuffix, CUDA-bound ones derive theirs from local_cudaver.
# NOTE(review): a literal True in the fourth position presumably selects the system toolchain - confirm
dependencies = [
('Ninja', '1.10.0'),
('Python', '3.8.5'),
('protobuf', '3.13.0'),
('protobuf-python', '3.13.0', versionsuffix),
('pybind11', '2.5.0', versionsuffix),
('SciPy-Stack', '2020', versionsuffix, ('gcccoremkl', '9.3.0-2020.2.254')),
('MPFR', '4.1.0'),
('GMP', '6.2.0'),
('numactl', '2.0.13', '', True),
('FFmpeg', '4.3.1'),
('cuDNN', '8.0.2.39', '-CUDA-%s' % local_cudaver, True),
('magma', '2.5.4'),
('NCCL', '2.8.3-1', '-CUDA-%s' % local_cudaver),
('LLVM', '10.0.1'),
('future', '0.18.2', versionsuffix),
('libvpx', '1.9.0'),
]
# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['7.0', '7.5', '8.0']

# PyTorch pulls in a bunch of submodules that are not releases.
# We download the submodule revisions from their repos.
# The list is generated by easybuild-framework/easybuild/scripts/createSubmoduleDeps.sh
#
# Template for the command that unpacks one submodule tarball into third_party/<dir>.
# It is %-formatted once per source entry with the target directory, so the doubled
# '%%(version)s' and '%%s' survive that step as the literal placeholders
# '%(version)s' and '%s' (presumably expanded later by the build framework - confirm).
local_extract_cmd_pattern = 'tar -C pytorch-%%(version)s/third_party/%s --strip-components=1 -xf %%s'

# Base URL for the main PyTorch source tarball; submodule entries carry their own source_urls
source_urls = ['https://github.com/pytorch/pytorch/archive']
sources = [
'v%(version)s.tar.gz', # PyTorch
{
'source_urls': ['https://github.com/Maratyszcza/FP16/archive'],
'download_filename': '4dfe081cf6bcd15db339cf2680b9281b8451eeb3.tar.gz',
'filename': 'FP16-20200514.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'FP16',
},
{
'source_urls': ['https://github.com/Maratyszcza/FXdiv/archive'],
'download_filename': 'b408327ac2a15ec3e43352421954f5b1967701d1.tar.gz',
'filename': 'FXdiv-20200417.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'FXdiv',
},
{
'source_urls': ['https://github.com/Maratyszcza/NNPACK/archive'],
'download_filename': '24b55303f5cf65d75844714513a0d1b1409809bd.tar.gz',
'filename': 'NNPACK-20191007.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'NNPACK',
},
{
'source_urls': ['https://github.com/pytorch/QNNPACK/archive'],
'download_filename': '7d2a4e9931a82adc3814275b6219a03e24e36b4c.tar.gz',
'filename': 'QNNPACK-20190828.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'QNNPACK',
},
{
'source_urls': ['https://github.com/google/XNNPACK/archive'],
'download_filename': '1b354636b5942826547055252f3b359b54acff95.tar.gz',
'filename': 'XNNPACK-20200323.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'XNNPACK',
},
{
'source_urls': ['https://github.com/google/benchmark/archive'],
'download_filename': '505be96ab23056580a3a2315abba048f4428b04e.tar.gz',
'filename': 'benchmark-20180606.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'benchmark',
},
{
'source_urls': ['https://github.com/pytorch/cpuinfo/archive'],
'download_filename': '63b254577ed77a8004a9be6ac707f3dccc4e1fd9.tar.gz',
'filename': 'cpuinfo-20200611.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'cpuinfo',
},
{
'source_urls': ['https://github.com/NVlabs/cub/archive'],
'download_filename': 'd106ddb991a56c3df1b6d51b2409e36ba8181ce4.tar.gz',
'filename': 'cub-20200512.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'cub',
},
{
'source_urls': ['https://github.com/eigenteam/eigen-git-mirror/archive'],
'download_filename': 'd41dc4dd74acce21fb210e7625d5d135751fa9e5.tar.gz',
'filename': 'eigen-20190125.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'eigen',
},
{
'source_urls': ['https://github.com/pytorch/fbgemm/archive'],
'download_filename': '1d710393d5b7588f5de3b83f51c22bbddf095229.tar.gz',
'filename': 'fbgemm-20200914.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'fbgemm',
},
{
'source_urls': ['https://github.com/asmjit/asmjit/archive'],
'download_filename': '9057aa30b620f0662ff51e2230c126a345063064.tar.gz',
'filename': 'asmjit-20200429.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/asmjit',
},
{
'source_urls': ['https://github.com/pytorch/cpuinfo/archive'],
'download_filename': 'd5e37adf1406cf899d7d9ec1d317c47506ccb970.tar.gz',
'filename': 'cpuinfo-20190201.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/cpuinfo',
},
{
'source_urls': ['https://github.com/google/googletest/archive'],
'download_filename': '0fc5466dbb9e623029b1ada539717d10bd45e99e.tar.gz',
'filename': 'googletest-20180925.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'fbgemm/third_party/googletest',
},
{
'source_urls': ['https://github.com/fmtlib/fmt/archive'],
'download_filename': 'cd4af11efc9c622896a3e4cb599fa28668ca3d05.tar.gz',
'filename': 'fmt-20200806.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'fmt',
},
{
'source_urls': ['https://github.com/houseroad/foxi/archive'],
'download_filename': '4aba696ec8f31794fd42880346dc586486205e0a.tar.gz',
'filename': 'foxi-20200922.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'foxi',
},
{
'source_urls': ['https://github.com/google/gemmlowp/archive'],
'download_filename': '3fb5c176c17c765a3492cd2f0321b0dab712f350.tar.gz',
'filename': 'gemmlowp-20181126.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'gemmlowp/gemmlowp',
},
{
'source_urls': ['https://github.com/facebookincubator/gloo/archive'],
'download_filename': '3dc0328fe6a9d47bd47c0c6ca145a0d8a21845c6.tar.gz',
'filename': 'gloo-20200918.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'gloo',
},
{
'source_urls': ['https://github.com/google/googletest/archive'],
'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz',
'filename': 'googletest-20180831.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'googletest',
},
{
'source_urls': ['https://github.com/intel/ideep/archive'],
'download_filename': 'ba885200dbbc1f144c7b58eba487378eb324f281.tar.gz',
'filename': 'ideep-20200915.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'ideep',
},
{
'source_urls': ['https://github.com/intel/mkl-dnn/archive'],
'download_filename': '5ef631a030a6f73131c77892041042805a06064f.tar.gz',
'filename': 'mkl-dnn-20200909.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'ideep/mkl-dnn',
},
{
'source_urls': ['https://github.com/onnx/onnx/archive'],
'download_filename': 'a82c6a7010e2e332d8f74ad5b0c726fd47c85376.tar.gz',
'filename': 'onnx-20200531.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'onnx',
},
{
'source_urls': ['https://github.com/google/benchmark/archive'],
'download_filename': 'e776aa0275e293707b6a0901e0e8d8a8a3679508.tar.gz',
'filename': 'benchmark-20180525.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'onnx/third_party/benchmark',
},
{
'source_urls': ['https://github.com/onnx/onnx-tensorrt/archive'],
'download_filename': 'c153211418a7c57ce071d9ce2a41f8d1c85a878f.tar.gz',
'filename': 'onnx-tensorrt-20190916.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'onnx-tensorrt',
},
{
'source_urls': ['https://github.com/Maratyszcza/psimd/archive'],
'download_filename': '072586a71b55b7f8c584153d223e95687148a900.tar.gz',
'filename': 'psimd-20200517.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'psimd',
},
{
'source_urls': ['https://github.com/Maratyszcza/pthreadpool/archive'],
'download_filename': '029c88620802e1361ccf41d1970bd5b07fd6b7bb.tar.gz',
'filename': 'pthreadpool-20200615.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'pthreadpool',
},
{
'source_urls': ['https://github.com/Maratyszcza/PeachPy/archive'],
'download_filename': '07d8fde8ac45d7705129475c0f94ed8925b93473.tar.gz',
'filename': 'PeachPy-20180218.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'python-peachpy',
},
{
'source_urls': ['https://github.com/shibatch/sleef/archive'],
'download_filename': '7f523de651585fe25cade462efccca647dcc8d02.tar.gz',
'filename': 'sleef-20190730.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'sleef',
},
{
'source_urls': ['https://github.com/01org/tbb/archive'],
'download_filename': 'a51a90bc609bb73db8ea13841b5cf7aa4344d4a9.tar.gz',
'filename': 'tbb-20181009.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'tbb',
},
{
'source_urls': ['https://github.com/pytorch/tensorpipe/archive'],
'download_filename': '95ff9319161fcdb3c674d2bb63fac3e94095b343.tar.gz',
'filename': 'tensorpipe-20200928.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'tensorpipe',
},
{
'source_urls': ['https://github.com/google/googletest/archive'],
'download_filename': '2fe3bd994b3189899d93f1d5a881e725e046fdc2.tar.gz',
'filename': 'googletest-20180831.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/googletest',
},
{
'source_urls': ['https://github.com/google/libnop/archive'],
'download_filename': 'aa95422ea8c409e3f078d2ee7708a5f59a8b9fa2.tar.gz',
'filename': 'libnop-20200723.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libnop',
},
{
'source_urls': ['https://github.com/libuv/libuv/archive'],
'download_filename': '02a9e1be252b623ee032a3137c0b0c94afbe6809.tar.gz',
'filename': 'libuv-20200419.tar.gz',
'extract_cmd': local_extract_cmd_pattern % 'tensorpipe/third_party/libuv',
},
]
patches = [
'PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.7.0_fix_test_DistributedDataParallel.patch',
'PyTorch-1.7.0_fix_typing_python38.patch',
'PyTorch-1.7.0_fix_remove_backports.patch',
]
# Test suites to leave out of the PyTorch test run, keyed by a selector string.
# NOTE(review): the '' key presumably means "applies everywhere" - confirm against the consumer
# of this parameter. Each suite is listed exactly once.
excluded_tests = {
    '': [
        # Fails on HDFML. Probably needs a GPU and a network - needs NCCL
        'distributed/test_distributed_fork',
        # Fails on HDFML.
        'distributed/test_distributed_spawn',
        # Fails on JUWELS (cluster) with SIGXCPU and on JUWELSBOOSTER
        'test_foreach',
        'test_xnnpack_integration',
        # Fails on JUSUF
        'distributed/rpc/test_process_group_agent',
        'distributed/rpc/test_tensorpipe_agent',
        'test_autograd',
        'test_jit',
        # Disabling all distributed tests because I don't have the whole year. Each test takes 2 hours
        # (the two distributed/rpc agent suites above also belong to this group)
        'distributed/nn/jit/test_instantiator',
        'distributed/rpc/test_faulty_agent',
        # This test fails everywhere: https://github.com/pytorch/pytorch/issues/41242
        # 'test_cpp_extensions_jit',
        # Throws illegal memory access due to float16: https://github.com/pytorch/pytorch/issues/41340
        # 'test_torch',
        # Potentially problematic save/load issue with test_lstm on only some machines. Tell users to verify save&load!
        # https://github.com/pytorch/pytorch/issues/43209
        # 'test_quantization',
    ],
}
# Test command: enter the test/ directory and launch run_test.py verbosely.
# '%(python)s' and '%(excluded_tests)s' are template placeholders
# (presumably substituted by the build framework from the settings in this file - confirm)
runtest = 'cd test && %(python)s run_test.py --verbose %(excluded_tests)s'
# Post-install check: the caffe2 Python package must be importable
sanity_check_commands = ["python -c 'import caffe2.python'"]
# Additional check script, referenced by file name
tests = ['PyTorch-check-cpp-extension.py']
moduleclass = 'devel'
--- setup.py.orig 2020-11-12 14:13:54.727306249 +0100
+++ setup.py 2020-11-12 14:14:42.105485851 +0100
@@ -337,7 +337,7 @@
################################################################################
# the list of runtime dependencies required by this built package
-install_requires = ['future', 'typing_extensions', 'dataclasses']
+install_requires = ['future']
missing_pydep = '''
Missing build dependency: Unable to `import {importname}`.
From 4ee880e1645da7c581a04de6cfe30d911f659f57 Mon Sep 17 00:00:00 2001
From: Alexander Grund <alexander.grund@tu-dresden.de>
Date: Thu, 15 Oct 2020 14:30:03 +0200
Subject: [PATCH] Distribute GPUs in round robin mode for distributed_test
The ProcessGroupNCCL::barrier implementation assumes that when
1 GPU/rank is used the GPU-Index equals the rank. Due to NCCL
communicator reuse this then leads to rank 0 using the (kinda)
temporary communicator while the other processes might use other GPUs
leading to them trying to create a new communicator and waiting for
rank 0 until that creates a new (potentially unrelated) one.
See #46248 for details
---
torch/testing/_internal/distributed/distributed_test.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index ccaccbda529aa..4d7c23b613d7c 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -362,16 +362,14 @@ def _init_multigpu_helper(self):
"""
nGPUs = torch.cuda.device_count()
world_size = dist.get_world_size()
- visible_devices = range(nGPUs)
if BACKEND == "nccl":
apply_hack_for_nccl()
nGPUs_per_process = nGPUs // world_size
rank_to_GPU = {
- i: list(
- visible_devices[i * nGPUs_per_process: (i + 1) * nGPUs_per_process]
- )
+ # Each rank has to get the GPU with the index equal to its rank
+ i: [i + gpu_num * world_size for gpu_num in range(nGPUs_per_process)]
for i in range(world_size)
}
return rank_to_GPU
\ No newline at end of file
--- test/jit/test_recursive_script.py.orig 2020-11-11 18:40:22.959601697 +0100
+++ test/jit/test_recursive_script.py 2020-11-11 18:40:44.718092798 +0100
@@ -2,7 +2,6 @@
import os
import sys
import typing
-import typing_extensions
from typing import List, Dict, Optional, Tuple
import torch
@@ -174,7 +173,7 @@
self.checkModule(M1(), (torch.randn(2, 2),))
class M2(torch.nn.Module):
- x : typing_extensions.Final[int]
+ x : typing.Final[int]
def __init__(self):
super().__init__()
#!/usr/bin/env python
# Smoke test: PyTorch must be able to JIT-compile a C++ extension.
# Needs at least Ninja and a functional C++ compiler (GCC preferred).
#
# Closely follows the PyTorch tutorial on C++ extensions
# Author: Alexander Grund (TU Dresden)
from torch.utils.cpp_extension import load_inline

# Smallest useful extension: one function that hands its tensor argument straight back
identity_src = "torch::Tensor test_func(torch::Tensor x) { return x; }"

# Compile the source and expose test_func from the resulting module
extension = load_inline(
    name='inline_extension',
    cpp_sources=identity_src,
    functions=['test_func'],
)

# A falsy result means no usable module was produced
assert extension
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment