From 679dec847efe2fe05a2cd2a60af03e3704fd41d6 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 18 Mar 2023 00:39:42 +0000 Subject: [PATCH] Use is_available instead of device_count to check for CUDA availability (#97043) There are some tests that incorrectly use the number of GPU devices `torch.cuda.device_count() > 0` to check for CUDA availability instead of the default `torch.cuda.is_available()` call. This makes these tests more brittle when encountering infra flakiness on G5 runners using A10G, for example [test_pytorch_np](https://hud.pytorch.org/failure/FAILED%20test_tensorboard.py%3A%3ATestTensorBoardPyTorchNumpy%3A%3Atest_pytorch_np%20-%20RuntimeError%3A%20No%20CUDA%20GPUs%20are%20available). The underlying problem is that GPU devices could crash on these runners. While the root cause for that is unclear and we will try to upgrade to a new NVIDIA driver https://github.com/pytorch/pytorch/pull/96904 to see if it helps, we can also make these tests more resilient by using the correct check, so that they are skipped correctly when the GPU crashes. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97043 Approved by: https://github.com/clee2000 --- test/test_sparse.py | 2 +- test/test_tensorboard.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_sparse.py b/test/test_sparse.py index e6e1c2f7eea..140f3a29d80 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -2941,7 +2941,7 @@ class TestSparse(TestSparseBase): self.skipTest(f'requires_grad==True requires float or complex dtype, got {dtype}') self._test_empty_full(device, dtype, requires_grad) - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self._test_empty_full(None, dtype, requires_grad) self._test_empty_full(torch.device('cuda:0'), dtype, requires_grad) diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py index 5d2ef1ee4df..4d657911aef 100644 --- a/test/test_tensorboard.py +++ b/test/test_tensorboard.py @@ -94,14 +94,14 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase): self.assertIsInstance(make_np(tensor), np.ndarray) # CUDA tensor - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self.assertIsInstance(make_np(tensor.cuda()), np.ndarray) # regular variable self.assertIsInstance(make_np(torch.autograd.Variable(tensor)), np.ndarray) # CUDA variable - if torch.cuda.device_count() > 0: + if torch.cuda.is_available(): self.assertIsInstance(make_np(torch.autograd.Variable(tensor).cuda()), np.ndarray) # python primitive type