diff --git a/test/distributed/test_c10d_common.py b/test/distributed/test_c10d_common.py
index 00aa0e9ef69..de33e852307 100644
--- a/test/distributed/test_c10d_common.py
+++ b/test/distributed/test_c10d_common.py
@@ -133,7 +133,7 @@ class AbstractTimeoutTest:
 class TimeoutTest(TestCase):
     @retry_on_connect_failures
     def test_store_based_barrier(self):
-        f = tempfile.NamedTemporaryFile()
+        f = tempfile.NamedTemporaryFile(delete=False)
         port = common.find_free_port()
 
         def thread_work(timeout, init_type, world_size, rank, error_list):
@@ -1756,7 +1756,7 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase):
             pass
 
     def test_init_process_group_optional_backend(self):
-        with tempfile.NamedTemporaryFile() as f:
+        with tempfile.NamedTemporaryFile(delete=False) as f:
             store = dist.FileStore(f.name, self.world_size)
             # creates both gloo and nccl backend
             if dist.is_gloo_available() and dist.is_nccl_available():
@@ -1785,7 +1785,7 @@ class ProcessGroupWithDispatchedCollectivesTests(MultiProcessTestCase):
             if not dist.is_ucc_available():
                 continue
 
-            with tempfile.NamedTemporaryFile() as f:
+            with tempfile.NamedTemporaryFile(delete=False) as f:
                 store = dist.FileStore(f.name, self.world_size)
                 dist.init_process_group(
                     backend=backend,
diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py
index 15fc8e353c6..c34ea263787 100644
--- a/test/distributed/test_c10d_nccl.py
+++ b/test/distributed/test_c10d_nccl.py
@@ -1197,6 +1197,31 @@ class ProcessGroupNCCLTest(MultiProcessTestCase):
         with self.assertRaises(dist.DistBackendError):
             pg.allreduce([t])
 
+    @requires_nccl()
+    @skip_but_pass_in_sandcastle_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs")
+    def test_file_store_check(self):
+        os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
+        os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"
+        # FileStore check() would be executed
+        os.environ["TORCH_NCCL_DUMP_ON_TIMEOUT"] = "1"
+        os.environ["TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC"] = "0"
+
+        # self.file_name is created using "delete=False"
+        # e.g., self.file_name = tempfile.NamedTemporaryFile(delete=False).name
+        store = dist.FileStore(self.file_name, self.world_size)
+        dist.init_process_group(
+            backend="nccl",
+            rank=self.rank,
+            world_size=self.world_size,
+            store=store
+        )
+        pg = dist.distributed_c10d._get_default_group()
+        self.assertEqual(pg.rank(), self.rank)
+        self.assertEqual(pg.size(), self.world_size)
+        # give enough time for check() to be executed multiple times
+        time.sleep(2)
+        dist.destroy_process_group()
+
     def _check_nccl_timeout(self, expected_timeout):
         pg = dist.distributed_c10d._get_default_group()
         options = pg._get_backend(torch.device(f"cuda:{self.rank}")).options