Enable test_distributed to work with spawn mode (#41769)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41769

Currently the tests in `test_distributed` only work with `fork` mode multiprocessing. This PR introduces support for `spawn` mode multiprocessing as well, while keeping `fork` mode intact.

Motivations for the change:
1) Spawn multiprocessing is the default on macOS, so it better emulates how macOS users would use distributed.
2) With Python 3.8+, spawn is the default on Linux, so we should have test coverage for this.
3) PyTorch multiprocessing suggests using spawn/forkserver over fork for sharing CUDA tensors: https://pytorch.org/docs/stable/multiprocessing.html
4) Spawn is better supported by certain sanitizers such as TSAN, so adding this coverage may help us uncover issues.

How it is done:
1) Move the `test_distributed` tests in the `_DistTestBase` class to a shared file, `distributed_test` (similar to how the RPC tests are structured).
2) For `Barrier`, refactor the setup of temp directories, since the current version did not work with spawn: each process would get a different randomly generated directory and thus would write to a different barrier.
3) Add all the relevant builds to run internally and in OSS.

Running test_distributed with spawn mode in OSS can be done with:
`python test/run_test.py -i distributed/test_distributed_spawn -v`

Reviewed By: izdeby

Differential Revision: D22408023

fbshipit-source-id: e206be16961fd80438f995e221f18139d7e6d2a9
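To make the Barrier change (point 2 under "How it is done") concrete: under fork, module-level state such as a randomly generated temp directory is shared with the children, but under spawn each child re-imports the module and generates its own directory, so the processes no longer agree on where the barrier files live. An environment variable set by the parent before the children start is inherited under both start methods. A minimal standalone sketch, not code from this PR; RANDOM_DIR, worker, and the world size of 2 are illustrative:

import os
import tempfile
import torch.multiprocessing as mp

# Module-level code is re-executed in every spawned child, so this directory
# differs per process under spawn; under fork it would be shared.
RANDOM_DIR = tempfile.mkdtemp()

def worker(rank):
    print(rank, "module-level dir:", RANDOM_DIR)               # differs per rank under spawn
    print(rank, "env-provided dir:", os.environ["TEMP_DIR"])   # identical in every rank

if __name__ == "__main__":
    # Set once in the parent; both fork and spawn children inherit the environment.
    os.environ["TEMP_DIR"] = tempfile.mkdtemp()
    mp.spawn(worker, nprocs=2)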
parent 1d01fcdc24
commit b22abbe381

5 changed files with 3223 additions and 3170 deletions
test/distributed/test_distributed.py
File diff suppressed because it is too large
35  test/distributed/test_distributed_spawn.py  Normal file
@@ -0,0 +1,35 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import unittest

import torch.distributed as dist
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ASAN, NO_MULTIPROCESSING_SPAWN
from torch.testing._internal.distributed.distributed_test import (
    DistributedTest, TestDistBackend
)

if not dist.is_available():
    print("Distributed not available, skipping tests", file=sys.stderr)
    sys.exit(0)

BACKEND = os.environ["BACKEND"]

if BACKEND == "gloo" or BACKEND == "nccl":

    @unittest.skipIf(
        TEST_WITH_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues"
    )
    @unittest.skipIf(
        NO_MULTIPROCESSING_SPAWN, "Spawn not available, skipping tests."
    )
    class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):

        def setUp(self):
            super().setUp()
            self._spawn_processes()


if __name__ == "__main__":
    run_tests()
test/run_test.py

@@ -35,6 +35,7 @@ TESTS = [
     'test_dataloader',
     'distributed/test_data_parallel',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
     'test_distributions',
     'test_expecttest',
     'test_foreach',
@@ -96,6 +97,7 @@ WINDOWS_BLOCKLIST = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
 ]

 ROCM_BLOCKLIST = [
@@ -142,6 +144,7 @@ SLOW_TESTS = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
+    'distributed/test_distributed_spawn',
     'test_cuda',
     'test_cuda_primary_ctx',
     'test_cpp_extensions_aot_ninja',
@@ -306,7 +309,8 @@ def test_distributed(test_module, test_directory, options):
         for with_init_file in {True, False}:
             tmp_dir = tempfile.mkdtemp()
             if options.verbose:
-                with_init = ' with file init_method' if with_init_file else ''
+                init_str = "with {} init_method"
+                with_init = init_str.format("file" if with_init_file else "env")
                 print_to_stderr(
                     'Running distributed tests for the {} backend{}'.format(
                         backend, with_init))
@@ -315,7 +319,7 @@ def test_distributed(test_module, test_directory, options):
             os.environ['INIT_METHOD'] = 'env://'
             os.environ.update(env_vars)
             if with_init_file:
-                if test_module == "test_distributed":
+                if test_module in ["test_distributed", "test_distributed_spawn"]:
                     init_method = 'file://{}/'.format(tmp_dir)
                 else:
                     init_method = 'file://{}/shared_init_file'.format(tmp_dir)
@@ -348,6 +352,7 @@ CUSTOM_HANDLERS = {
     'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
     'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
     'distributed/test_distributed': test_distributed,
+    'distributed/test_distributed_spawn': test_distributed,
 }
torch/testing/_internal/common_distributed.py

@@ -194,6 +194,26 @@ def simple_sparse_reduce_tests(rank, world_size, num_inputs=1):
         ]
     ]

+tmp_dir = None
+def initialize_temp_directories(init_method=None):
+    global tmp_dir
+    tmp_dir = tempfile.TemporaryDirectory()
+    os.environ["TEMP_DIR"] = tmp_dir.name
+    os.mkdir(os.path.join(tmp_dir.name, "barrier"))
+    os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
+    init_dir_path = os.path.join(tmp_dir.name, "init_dir")
+    os.mkdir(init_dir_path)
+    # Set init method if specified.
+    if init_method is not None:
+        os.environ["INIT_METHOD"] = init_method
+    else:
+        os.environ["INIT_METHOD"] = "file://" + os.path.join(
+            init_dir_path, "shared_init_file"
+        )
+
+def cleanup_temp_dir():
+    if tmp_dir is not None:
+        tmp_dir.cleanup()

 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
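The helpers above exist so that the directory layout (barrier, test_dir, init_dir) is created once, in the parent, and reaches every worker through the environment, which both fork and spawn children inherit, rather than through module state, which only fork preserves. A sketch of how a harness might use them; the import path and the worker body are assumptions for illustration, not lines from this PR:

import os
import torch.multiprocessing as mp
# Assumed import location, based on the hunk above sitting next to MultiProcessTestCase.
from torch.testing._internal.common_distributed import initialize_temp_directories, cleanup_temp_dir

def worker(rank):
    # Every spawned child inherits the same TEMP_DIR and INIT_METHOD, so all
    # ranks agree on one shared "barrier" directory.
    barrier_dir = os.path.join(os.environ["TEMP_DIR"], "barrier")
    print(rank, barrier_dir)

if __name__ == "__main__":
    initialize_temp_directories()      # parent sets TEMP_DIR / INIT_METHOD once
    try:
        mp.spawn(worker, nprocs=2)     # children inherit the environment
    finally:
        cleanup_temp_dir()             # parent removes the shared temp tree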
@@ -243,6 +263,7 @@ class MultiProcessTestCase(TestCase):
     def setUp(self):
         super().setUp()
         self.skip_return_code_checks = []
+        self.processes = []
         self.rank = self.MAIN_PROCESS_RANK
         self.file_name = tempfile.NamedTemporaryFile(delete=False).name

3151  torch/testing/_internal/distributed/distributed_test.py  Normal file
File diff suppressed because it is too large