From 679dec847efe2fe05a2cd2a60af03e3704fd41d6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 18 Mar 2023 00:39:42 +0000 Subject: [PATCH] Use is_available instead of device_count to check for CUDA availability (#97043) There are some tests that incorrectly use the number of GPU devices `torch.cuda.device_count() > 0` to check for CUDA availability instead of the default `torch.cuda.is_available()` call. This makes these tests more brittle when encountering infra flakiness on G5 runners using A10G, for example [test_pytorch_np](https://hud.pytorch.org/failure/FAILED%20test_tensorboard.py%3A%3ATestTensorBoardPyTorchNumpy%3A%3Atest_pytorch_np%20-%20RuntimeError%3A%20No%20CUDA%20GPUs%20are%20available). The underlying problem is that GPU devices could crash on these runners. While the root cause for that is unclear and we will try to upgrade to a new NVIDIA driver https://github.com/pytorch/pytorch/pull/96904 to see if it helps, we can also make these tests more resilient by using the correct check, so that they are skipped correctly when the GPU crashes. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97043 Approved by: https://github.com/clee2000 --- test/test_sparse.py | 2 +- test/test_tensorboard.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_sparse.py b/test/test_sparse.py index e6e1c2f7eea..140f3a29d80 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -2941,7 +2941,7 @@ class TestSparse(TestSparseBase): self.skipTest(f'requires_grad==True requires float or complex dtype, got {dtype}') self._test_empty_full(device, dtype, requires_grad) - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self._test_empty_full(None, dtype, requires_grad) self._test_empty_full(torch.device('cuda:0'), dtype, requires_grad) diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 5d2ef1ee4df..4d657911aef 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -94,14 +94,14 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase): self.assertIsInstance(make_np(tensor), np.ndarray) # CUDA tensor - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self.assertIsInstance(make_np(tensor.cuda()), np.ndarray) # regular variable self.assertIsInstance(make_np(torch.autograd.Variable(tensor)), np.ndarray) # CUDA variable - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self.assertIsInstance(make_np(torch.autograd.Variable(tensor).cuda()), np.ndarray) # python primitive type