Enable test_distributed to work with spawn mode (#41769)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/41769

Currently the tests in `test_distributed` only work with `fork`-mode multiprocessing. This PR introduces support for `spawn`-mode multiprocessing as well, while keeping the `fork` mode intact.
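For readers unfamiliar with the two start methods, here is a minimal, self-contained sketch of spawn-mode distributed execution (illustrative only, not the PR's test harness — the worker function, port, and world size are made up):

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    # Under spawn, each worker is a fresh interpreter: it re-imports this
    # module and receives its rank as the first argument from mp.spawn.
    dist.init_process_group("gloo", init_method="env://",
                            rank=rank, world_size=world_size)
    t = torch.ones(1) * rank
    dist.all_reduce(t)  # default op is SUM: 0 + 1 = 1 on both ranks
    print("rank {}: {}".format(rank, t.item()))
    dist.destroy_process_group()


if __name__ == "__main__":
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"  # illustrative free port
    mp.spawn(worker, args=(2,), nprocs=2)
```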

Motivations for the change:
1) `spawn` is the default multiprocessing start method on macOS, so spawn-mode tests better emulate how macOS users run distributed.
2) Python 3.8 made `spawn` that default (it had been `fork`), so newer Python environments exercise spawn out of the box and we should have test coverage for this.
3) The PyTorch multiprocessing docs recommend `spawn`/`forkserver` over `fork` for sharing CUDA tensors: https://pytorch.org/docs/stable/multiprocessing.html
4) `spawn` is better supported by certain sanitizers such as TSAN, so adding this sanitizer coverage may help us uncover issues.

How it is done:
1) Move the `test_distributed` tests in the `_DistTestBase` class to a shared file, `distributed_test` (similar to how the RPC tests are structured).
2) For `Barrier`, refactor the setup of temp directories: the previous version did not work with `spawn`, because each process would get a different randomly generated directory and would therefore write to a different barrier (see the sketch after this list).
3) Add all the relevant builds so these tests run internally and in OSS.

Running `test_distributed` with spawn mode in OSS can be done with:
`python test/run_test.py -i distributed/test_distributed_spawn -v`
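A minimal sketch (not the PR's code) of the failure mode behind point 2: with `spawn`, each worker re-imports the module and re-executes top-level code, so a randomly named temp directory created at import time diverges across processes, while `fork` children inherit the parent's copy:

```python
import tempfile

import torch.multiprocessing as mp

# Top-level code runs once in the parent under "fork", but runs again in
# every child under "spawn" -- so each spawned worker sees a different path.
BARRIER_DIR = tempfile.mkdtemp()


def worker(rank):
    # Under spawn, the ranks print *different* directories and would
    # therefore synchronize on different barriers; under fork they match.
    print("rank {}: {}".format(rank, BARRIER_DIR))


if __name__ == "__main__":
    mp.spawn(worker, nprocs=2)  # torch.multiprocessing.spawn uses "spawn"
```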

Reviewed By: izdeby

Differential Revision: D22408023

fbshipit-source-id: e206be16961fd80438f995e221f18139d7e6d2a9
Rohan Varma 2020-09-08 23:08:55 -07:00 committed by Facebook GitHub Bot
parent 1d01fcdc24
commit b22abbe381
5 changed files with 3223 additions and 3170 deletions

test/distributed/test_distributed.py — diff suppressed (too large; the bulk of this file moved to the shared `distributed_test` file).

test/distributed/test_distributed_spawn.py (new file):

@@ -0,0 +1,35 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import sys
+import unittest
+
+import torch.distributed as dist
+from torch.testing._internal.common_utils import run_tests, TEST_WITH_ASAN, NO_MULTIPROCESSING_SPAWN
+from torch.testing._internal.distributed.distributed_test import (
+    DistributedTest, TestDistBackend
+)
+
+if not dist.is_available():
+    print("Distributed not available, skipping tests", file=sys.stderr)
+    sys.exit(0)
+
+BACKEND = os.environ["BACKEND"]
+
+if BACKEND == "gloo" or BACKEND == "nccl":
+
+    @unittest.skipIf(
+        TEST_WITH_ASAN, "Skip ASAN as torch + multiprocessing spawn have known issues"
+    )
+    @unittest.skipIf(
+        NO_MULTIPROCESSING_SPAWN, "Spawn not available, skipping tests."
+    )
+    class TestDistBackendWithSpawn(TestDistBackend, DistributedTest._DistTestBase):
+        def setUp(self):
+            super().setUp()
+            self._spawn_processes()
+
+if __name__ == "__main__":
+    run_tests()
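One usage note: this file reads `BACKEND` from the environment before any test class is defined, so invoking it directly would presumably look like `BACKEND=gloo python test/distributed/test_distributed_spawn.py` (hypothetical command; `run_test.py` normally exports `BACKEND`, `INIT_METHOD`, and the rest of the configuration via the handler below).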

test/run_test.py:

@@ -35,6 +35,7 @@ TESTS = [
     'test_dataloader',
     'distributed/test_data_parallel',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
     'test_distributions',
     'test_expecttest',
     'test_foreach',
@@ -96,6 +97,7 @@ WINDOWS_BLOCKLIST = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/test_distributed',
+    'distributed/test_distributed_spawn',
 ]

 ROCM_BLOCKLIST = [
@@ -142,6 +144,7 @@ SLOW_TESTS = [
     'distributed/rpc/test_process_group_agent',
     'distributed/rpc/test_tensorpipe_agent',
     'distributed/algorithms/ddp_comm_hooks/test_ddp_hooks',
+    'distributed/test_distributed_spawn',
     'test_cuda',
     'test_cuda_primary_ctx',
     'test_cpp_extensions_aot_ninja',
@@ -306,7 +309,8 @@ def test_distributed(test_module, test_directory, options):
         for with_init_file in {True, False}:
             tmp_dir = tempfile.mkdtemp()
             if options.verbose:
-                with_init = ' with file init_method' if with_init_file else ''
+                init_str = "with {} init_method"
+                with_init = init_str.format("file" if with_init_file else "env")
                 print_to_stderr(
                     'Running distributed tests for the {} backend{}'.format(
                         backend, with_init))
@@ -315,7 +319,7 @@ def test_distributed(test_module, test_directory, options):
             os.environ['INIT_METHOD'] = 'env://'
             os.environ.update(env_vars)
             if with_init_file:
-                if test_module == "test_distributed":
+                if test_module in ["test_distributed", "test_distributed_spawn"]:
                     init_method = 'file://{}/'.format(tmp_dir)
                 else:
                     init_method = 'file://{}/shared_init_file'.format(tmp_dir)
@@ -348,6 +352,7 @@ CUSTOM_HANDLERS = {
     'test_cpp_extensions_aot_no_ninja': test_cpp_extensions_aot_no_ninja,
     'test_cpp_extensions_aot_ninja': test_cpp_extensions_aot_ninja,
     'distributed/test_distributed': test_distributed,
+    'distributed/test_distributed_spawn': test_distributed,
 }
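Note the design choice in the last hunk: `distributed/test_distributed_spawn` reuses the existing `test_distributed` custom handler, so the spawn-mode suite automatically runs through the same backend × {file, env} init-method matrix as the fork-mode suite.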

torch/testing/_internal/common_distributed.py:

@@ -194,6 +194,26 @@ def simple_sparse_reduce_tests(rank, world_size, num_inputs=1):
         ]
     ]

+
+tmp_dir = None
+
+
+def initialize_temp_directories(init_method=None):
+    global tmp_dir
+    tmp_dir = tempfile.TemporaryDirectory()
+    os.environ["TEMP_DIR"] = tmp_dir.name
+    os.mkdir(os.path.join(tmp_dir.name, "barrier"))
+    os.mkdir(os.path.join(tmp_dir.name, "test_dir"))
+    init_dir_path = os.path.join(tmp_dir.name, "init_dir")
+    os.mkdir(init_dir_path)
+    # Set init method if specified.
+    if init_method is not None:
+        os.environ["INIT_METHOD"] = init_method
+    else:
+        os.environ["INIT_METHOD"] = "file://" + os.path.join(
+            init_dir_path, "shared_init_file"
+        )
+
+
+def cleanup_temp_dir():
+    if tmp_dir is not None:
+        tmp_dir.cleanup()
+

 # [How does MultiProcessTestCase work?]
 # Each MultiProcessTestCase instance uses 1 + `world_size()` processes, by
@@ -243,6 +263,7 @@ class MultiProcessTestCase(TestCase):
     def setUp(self):
         super().setUp()
         self.skip_return_code_checks = []
+        self.processes = []
         self.rank = self.MAIN_PROCESS_RANK
         self.file_name = tempfile.NamedTemporaryFile(delete=False).name
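To tie the helpers together, a hedged sketch of how a spawn-compatible driver might use them (hypothetical driver code; only the two imported helpers come from the diff above): the parent process creates the directories once and publishes them through environment variables, which spawned children inherit, instead of each child generating its own random paths.

```python
# Assumed import path: the file patched above.
from torch.testing._internal.common_distributed import (
    initialize_temp_directories,
    cleanup_temp_dir,
)


def run_suite(spawn_workers):
    # Parent process: create barrier/test/init dirs once and export them.
    initialize_temp_directories()  # sets TEMP_DIR and a file:// INIT_METHOD
    try:
        # Spawned children inherit os.environ, so every rank reads the same
        # TEMP_DIR and rendezvouses on the same shared_init_file.
        spawn_workers()
    finally:
        cleanup_temp_dir()
```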

torch/testing/_internal/distributed/distributed_test.py — diff suppressed (too large; this is the new shared file that now hosts the `_DistTestBase` tests).