pytorch/test/distributed/test_distributed.py
Rohan Varma b22abbe381 Enable test_distributed to work with spawn mode (#41769)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41769

Currently the tests in `test_distributed` only work with the `fork` mode multiprocessing, this PR introduces support for `spawn` mode multiprocessing as well (while keeping the `fork` mode intact).

Motivations for the change:
1) Spawn multiprocessing is the default on MacOS, so it better emulates how MacOS users would use distributed
2) With python 3.8+, spawn is the default on linux, so we should have test coverage for this
3) PT multiprocessing suggests using spawn/forkserver over fork, for sharing cuda tensors: https://pytorch.org/docs/stable/multiprocessing.html
4) Spawn is better supported with respect to certain sanitizers such as TSAN, so adding this sanitizer coverage may help us uncover issues.

How it is done:
1) Move `test_distributed` tests in `_DistTestBase` class to a shared file `distributed_test` (similar to how the RPC tests are structured)
2) For `Barrier`, refactor the setup of temp directories, because the current version did not work with spawn: each process would get a different randomly generated directory and would therefore write to a different barrier.
3) Add all the relevant builds to run internally and in OSS.
Running test_distributed with spawn mode in OSS can be done with:
`python test/run_test.py -i distributed/test_distributed_spawn -v`

Reviewed By: izdeby

Differential Revision: D22408023

fbshipit-source-id: e206be16961fd80438f995e221f18139d7e6d2a9
2020-09-08 23:11:12 -07:00

106 lines
3.3 KiB
Python

from __future__ import absolute_import, division, print_function, unicode_literals
import os
import sys
import tempfile
from functools import wraps
import torch
import torch.cuda
import torch.distributed as dist
from torch.testing._internal.common_utils import TestCase, find_free_port, run_tests
from torch.distributed.distributed_c10d import _get_default_group
from torch.testing._internal.distributed.distributed_test import (
DistributedTest, TestDistBackend
)
# Message printed by skip_if_no_ninja (below) when the C++-extension tests
# cannot run because the ninja build tool is not installed.
CPP_EXTENSIONS_WARNING = """
Ninja (https://ninja-build.org) must be available to run C++ extensions tests,
but it could not be found. Install ninja with `pip install ninja`
or `conda install ninja`.
"""
# torch.distributed can be compiled out of a build; in that case these tests
# are meaningless, so exit cleanly (status 0) instead of erroring.
if not dist.is_available():
    print("Distributed not available, skipping tests", file=sys.stderr)
    sys.exit(0)
# Backend under test ("gloo", "nccl", "mpi", or "test") is selected by the
# test harness via the BACKEND environment variable; a missing variable is a
# hard KeyError on purpose.
BACKEND = os.environ["BACKEND"]
# Rendezvous init method for process-group setup; defaults to "env://",
# which reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE from the environment.
INIT_METHOD = os.getenv("INIT_METHOD", "env://")
def skip_if_no_ninja(func):
    """Decorator that runs ``func`` only when the ninja build tool exists.

    If ninja is unavailable, a warning is printed and ``0`` is returned in
    place of the wrapped test's result.
    """
    @wraps(func)
    def wrapped(*args, **kwargs):
        try:
            # Imported lazily so merely importing this module never requires
            # the cpp_extension machinery.
            import torch.utils.cpp_extension
            torch.utils.cpp_extension.verify_ninja_availability()
        except RuntimeError:
            # ninja missing: warn and bail out without running the test.
            print(CPP_EXTENSIONS_WARNING)
            return 0
        else:
            return func(*args, **kwargs)
    return wrapped
# gloo/nccl: run the shared distributed suite, forking one worker process per
# rank in setUp.
if BACKEND in ("gloo", "nccl"):
    class TestDistBackendWithFork(TestDistBackend, DistributedTest._DistTestBase):
        """Fork-mode runner for the shared _DistTestBase test suite."""
        def setUp(self):
            super().setUp()
            # Launch the per-rank worker processes via fork.
            self._fork_processes()
elif BACKEND == "mpi":
    # NOTE(review): the process group is initialized once at import time
    # (not per-test in setUp), which presumably assumes the ranks are
    # launched externally (e.g. via mpirun) — confirm against the harness.
    WORLD_SIZE = os.environ["WORLD_SIZE"]
    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")
    class TestMPI(DistributedTest._DistTestBase):
        # All test cases come from the shared _DistTestBase mix-in.
        pass
elif BACKEND == "test":
    class TestBackendDynamicLoad(TestCase):
        """Builds and dynamically loads a toy "test" c10d backend from a C++
        extension, then exercises the basic process-group APIs against it."""
        def setUp(self):
            super(TestBackendDynamicLoad, self).setUp()
        def _load_test_backend(self):
            # Compile the extension into a fresh temp build directory.
            # Loading the module registers the "test" backend as a side
            # effect; the returned module object is not used further.
            temp_dir = tempfile.mkdtemp()
            src = "{}/../cpp_extensions/cpp_c10d_extension.cpp".format(os.path.abspath(os.path.dirname(__file__)))
            # NOTE(review): torch.utils.cpp_extension is only imported inside
            # the skip_if_no_ninja wrapper above; this call relies on the
            # decorated test having triggered that import first so the
            # submodule attribute exists on torch.utils — confirm.
            extension = torch.utils.cpp_extension.load(
                name="torch_test",
                sources=[src],
                build_directory=temp_dir
            )
        @skip_if_no_ninja
        def test_backend_apis(self):
            # Single-process "world": rank 0 of world size 1, rendezvous via
            # env:// on a free local port.
            self._load_test_backend()
            os.environ['WORLD_SIZE'] = '1'
            os.environ['MASTER_ADDR'] = '127.0.0.1'
            os.environ['MASTER_PORT'] = str(find_free_port())
            os.environ['RANK'] = '0'
            dist.init_process_group(backend='test', init_method='env://', world_size=1, rank=0)
            self.assertEqual(dist.get_rank(), 0)
            self.assertEqual(dist.get_world_size(), 1)
            process_group = _get_default_group()
            # Run one allreduce and one broadcast; only the Work-object
            # completion/success flags are checked, not the tensor values.
            work = process_group.allreduce([torch.rand(1), torch.rand(1)])
            self.assertTrue(work.wait())
            self.assertTrue(work.is_completed())
            self.assertTrue(work.is_success())
            work = process_group.broadcast([torch.rand(1)])
            self.assertTrue(work.wait())
            self.assertTrue(work.is_completed())
            self.assertTrue(work.is_success())
            dist.destroy_process_group()
if __name__ == "__main__":
    # The parent process must not have initialized CUDA before the worker
    # processes are forked/spawned (a CUDA context cannot safely be shared
    # across a fork), hence this guard before launching any tests.
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"
    run_tests()