[c10d] Test needs abort; otherwise will hang (#141509)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141509
Approved by: https://github.com/wz337, https://github.com/fduwjj
This commit is contained in:
Ke Wen 2024-11-25 14:08:27 -08:00 committed by PyTorch MergeBot
parent 5accae4197
commit 9e299b883b

View file

@ -2913,6 +2913,9 @@ class NcclErrorHandlingTest(MultiProcessTestCase):
# nccl error happening before rank 0 times out
time.sleep(4)
# Mimicking all ranks sensing the timeout, abort
process_group.abort()
if prev_nccl_async_error_handling is not None:
os.environ[
"TORCH_NCCL_ASYNC_ERROR_HANDLING"