From 9e299b883be8c8efd672406d5cd68a3f5baebb44 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Mon, 25 Nov 2024 14:08:27 -0800 Subject: [PATCH] [c10d] Test needs abort; otherwise will hang (#141509) Pull Request resolved: https://github.com/pytorch/pytorch/pull/141509 Approved by: https://github.com/wz337, https://github.com/fduwjj --- test/distributed/test_c10d_nccl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index 73c29c640da..15e38033310 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -2913,6 +2913,9 @@ class NcclErrorHandlingTest(MultiProcessTestCase): # nccl error happening before rank 0 timeouts time.sleep(4) + # Mimicking all ranks sensing the timeout, abort + process_group.abort() + if prev_nccl_async_error_handling is not None: os.environ[ "TORCH_NCCL_ASYNC_ERROR_HANDLING"