Enable test_distributed to work with spawn mode (#41769)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41769

Currently the tests in `test_distributed` only work with `fork`-mode multiprocessing. This PR introduces support for `spawn`-mode multiprocessing as well, while keeping the `fork` mode intact.
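For readers unfamiliar with the two start methods, here is a minimal, self-contained sketch of spawn-mode distributed execution (illustrative only, not the PR's test harness — the worker function, port, and world size are made up):

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    # Under spawn, each worker is a fresh interpreter: it re-imports this
    # module and receives its rank as the first argument from mp.spawn.
    dist.init_process_group("gloo", init_method="env://",
                            rank=rank, world_size=world_size)
    t = torch.ones(1) * rank
    dist.all_reduce(t)  # default op is SUM: 0 + 1 = 1 on both ranks
    print("rank {}: {}".format(rank, t.item()))
    dist.destroy_process_group()


if __name__ == "__main__":
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"  # illustrative free port
    mp.spawn(worker, args=(2,), nprocs=2)
```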

Motivations for the change:
1) `spawn` is the default multiprocessing start method on macOS, so spawn-mode tests better emulate how macOS users run distributed.
2) Python 3.8 made `spawn` that default (it had been `fork`), so newer Python environments exercise spawn out of the box and we should have test coverage for this.
3) The PyTorch multiprocessing docs recommend `spawn`/`forkserver` over `fork` for sharing CUDA tensors: https://pytorch.org/docs/stable/multiprocessing.html
4) `spawn` is better supported by certain sanitizers such as TSAN, so adding this sanitizer coverage may help us uncover issues.

How it is done:
1) Move the `test_distributed` tests in the `_DistTestBase` class to a shared file, `distributed_test` (similar to how the RPC tests are structured).
2) For `Barrier`, refactor the setup of temp directories: the previous version did not work with `spawn`, because each process would get a different randomly generated directory and would therefore write to a different barrier (see the sketch after this list).
3) Add all the relevant builds so these tests run internally and in OSS.

Running `test_distributed` with spawn mode in OSS can be done with:
`python test/run_test.py -i distributed/test_distributed_spawn -v`
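A minimal sketch (not the PR's code) of the failure mode behind point 2: with `spawn`, each worker re-imports the module and re-executes top-level code, so a randomly named temp directory created at import time diverges across processes, while `fork` children inherit the parent's copy:

```python
import tempfile

import torch.multiprocessing as mp

# Top-level code runs once in the parent under "fork", but runs again in
# every child under "spawn" -- so each spawned worker sees a different path.
BARRIER_DIR = tempfile.mkdtemp()


def worker(rank):
    # Under spawn, the ranks print *different* directories and would
    # therefore synchronize on different barriers; under fork they match.
    print("rank {}: {}".format(rank, BARRIER_DIR))


if __name__ == "__main__":
    mp.spawn(worker, nprocs=2)  # torch.multiprocessing.spawn uses "spawn"
```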

Reviewed By: izdeby

Differential Revision: D22408023

fbshipit-source-id: e206be16961fd80438f995e221f18139d7e6d2a9
Rohan Varma 2020-09-08 23:08:55 -07:00 committed by Facebook GitHub Bot
parent 1d01fcdc24
commit b22abbe381
5 changed files with 3223 additions and 3170 deletions

test/distributed/test_distributed.py — diff suppressed (too large; the bulk of this file moved to the shared `distributed_test` file).

test/distributed/test_distributed_spawn.py (new file):

@@ -0,0 +1,35 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import sys
+import unittest
+
+import torch.distributed as dist
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_ASAN, NO_MULTIPROCESSING_SPAWN
+from torch.testing._internal.distributed.distributed_test import (
+    DistributedTest, TestDistBackend
+)
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+BACKEND = os.environ["BACKEND"]
+
+if BACKEND == "gloo" or BACKEND == "nccl":
+
+    @unittest.skipIf(
+        TEST_WITH_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues"
+    )
+    @unittest.skipIf(
+        NO_MULTIPROCESSING_SPAWN, "Spawn not available, skipping tests."
+    )
+    class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):
+        def setUp(self):
+            super().setUp()
+            self._spawn_processes()
+
+if __name__ == "__main__":
+    run_tests()
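One usage note: this file reads `BACKEND` from the environment before any test class is defined, so invoking it directly would presumably look like `BACKEND=gloo python test/distributed/test_distributed_spawn.py` (hypothetical command; `run_test.py` normally exports `BACKEND`, `INIT_METHOD`, and the rest of the configuration via the handler below).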

test/run_test.py:

@@ -35,6 +35,7 @@ TESTS = [
     'test_dataloader',
     'distributed/test_data_parallel',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
     'test_distributions',
     'test_expecttest',
     'test_foreach',
@@ -96,6 +97,7 @@ WINDOWS_BLOCKLIST = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
 ]

 ROCM_BLOCKLIST = [
@@ -142,6 +144,7 @@ SLOW_TESTS = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
+    'distributed/test_distributed_spawn',
     'test_cuda',
     'test_cuda_primary_ctx',
     'test_cpp_extensions_aot_ninja',
@@ -306,7 +309,8 @@ def test_distributed(test_module, test_directory, options):
         for with_init_file in {True, False}:
             tmp_dir = tempfile.mkdtemp()
             if options.verbose:
-                with_init = ' with file init_method' if with_init_file else ''
+                init_str = "with {} init_method"
+                with_init = init_str.format("file" if with_init_file else "env")
                 print_to_stderr(
                     'Running distributed tests for the {} backend{}'.format(
                         backend, with_init))
@@ -315,7 +319,7 @@ def test_distributed(test_module, test_directory, options):
             os.environ['INIT_METHOD'] = 'env://'
             os.environ.update(env_vars)
             if with_init_file:
-                if test_module == "test_distributed":
+                if test_module in ["test_distributed", "test_distributed_spawn"]:
                     init_method = 'file://{}/'.format(tmp_dir)
                 else:
                     init_method = 'file://{}/shared_init_file'.format(tmp_dir)
@@ -348,6 +352,7 @@ CUSTOM_HANDLERS = {
     'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
     'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
     'distributed/test_distributed': test_distributed,
+    'distributed/test_distributed_spawn': test_distributed,
 }
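Note the design choice in the last hunk: `distributed/test_distributed_spawn` reuses the existing `test_distributed` custom handler, so the spawn-mode suite automatically runs through the same backend × {file, env} init-method matrix as the fork-mode suite.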

torch/testing/_internal/common_distributed.py:

@@ -194,6 +194,26 @@ def simple_sparse_reduce_tests(rank, world_size, num_inputs=1):
         ]
     ]

+
+tmp_dir = None
+
+
+def initialize_temp_directories(init_method=None):
+    global tmp_dir
+    tmp_dir = tempfile.TemporaryDirectory()
+    os.environ["TEMP_DIR"] = tmp_dir.name
+    os.mkdir(os.path.join(tmp_dir.name, "barrier"))
+    os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
+    init_dir_path = os.path.join(tmp_dir.name, "init_dir")
+    os.mkdir(init_dir_path)
+    # Set init method if specified.
+    if init_method is not None:
+        os.environ["INIT_METHOD"] = init_method
+    else:
+        os.environ["INIT_METHOD"] = "file://" + os.path.join(
+            init_dir_path, "shared_init_file"
+        )
+
+
+def cleanup_temp_dir():
+    if tmp_dir is not None:
+        tmp_dir.cleanup()
+

 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
@@ -243,6 +263,7 @@ class MultiProcessTestCase(TestCase):
     def setUp(self):
         super().setUp()
         self.skip_return_code_checks = []
+        self.processes = []
         self.rank = self.MAIN_PROCESS_RANK
         self.file_name = tempfile.NamedTemporaryFile(delete=False).name
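To tie the helpers together, a hedged sketch of how a spawn-compatible driver might use them (hypothetical driver code; only the two imported helpers come from the diff above): the parent process creates the directories once and publishes them through environment variables, which spawned children inherit, instead of each child generating its own random paths.

```python
# Assumed import path: the file patched above.
from torch.testing._internal.common_distributed import (
    initialize_temp_directories,
    cleanup_temp_dir,
)


def run_suite(spawn_workers):
    # Parent process: create barrier/test/init dirs once and export them.
    initialize_temp_directories()  # sets TEMP_DIR and a file:// INIT_METHOD
    try:
        # Spawned children inherit os.environ, so every rank reads the same
        # TEMP_DIR and rendezvouses on the same shared_init_file.
        spawn_workers()
    finally:
        cleanup_temp_dir()
```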

torch/testing/_internal/distributed/distributed_test.py — diff suppressed (too large; this is the new shared file that now hosts the `_DistTestBase` tests).