diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 9daad85b0..2352cf522 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -38,17 +38,18 @@ from transformers.testing_utils import (
     CaptureStderr,
     LoggingLevel,
     TestCasePlus,
+    backend_device_count,
     execute_subprocess_async,
-    get_gpu_count,
     mockenv_context,
     require_deepspeed,
     require_optuna,
-    require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
     slow,
+    torch_device,
 )
 from transformers.trainer_utils import get_last_checkpoint, set_seed
-from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_gpu_available
+from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device
 
 
 if is_torch_available():
@@ -125,7 +126,7 @@ def get_launcher(distributed=False):
     #    - it won't be able to handle that
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with mode gpus because we use very little data)
-    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1
     master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
@@ -145,7 +146,7 @@ optims = [HF_OPTIM, DS_OPTIM]
 schedulers = [HF_SCHEDULER, DS_SCHEDULER]
 
 stages = [ZERO2, ZERO3]
-if is_torch_bf16_gpu_available():
+if is_torch_bf16_available_on_device(torch_device):
     dtypes = [FP16, BF16]
 else:
     dtypes = [FP16]
@@ -165,7 +166,7 @@ params_with_optims_and_schedulers = list(itertools.product(stages, dtypes, optim
 
 
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     """
     Testing non-Trainer DeepSpeed integration
@@ -273,7 +274,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
 
 
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
     """
 
@@ -875,7 +876,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
 
 @slow
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TestDeepSpeedWithLauncher(TestCasePlus):
     """This class is for testing via an external script - can do multiple gpus"""
 
@@ -896,7 +897,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
     #
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_basic_distributed(self, stage, dtype):
         self.run_and_check(stage=stage, dtype=dtype, distributed=True)
 
@@ -927,7 +928,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         )
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_fp32_distributed(self, stage, dtype):
         # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
         # therefore no quality checks, just basic completion checks are done
@@ -968,9 +969,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
 
     @parameterized.expand(["bf16", "fp16", "fp32"])
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_inference(self, dtype):
-        if dtype == "bf16" and not is_torch_bf16_gpu_available():
+        if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
             self.skipTest("test requires bfloat16 hardware support")
 
         # this is just inference, so no optimizer should be loaded
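
For context on the pattern this diff applies throughout the file: each CUDA-only helper (get_gpu_count, require_torch_gpu, require_torch_multi_gpu, is_torch_bf16_gpu_available) is swapped for a device-agnostic counterpart parameterized by torch_device. Below is a minimal sketch of how the new helpers compose in a standalone test; the ToyAcceleratorTest class and its assertions are illustrative, not part of the diff, and the exact set of backends covered by require_torch_accelerator depends on the installed transformers version.

# A minimal sketch of the device-agnostic helpers this diff adopts.
# The imports are the real transformers.testing_utils / transformers.utils
# names used in the diff; the test class itself is hypothetical.
from transformers.testing_utils import (
    TestCasePlus,
    backend_device_count,
    require_torch_accelerator,
    torch_device,
)
from transformers.utils import is_torch_bf16_available_on_device


@require_torch_accelerator  # skips on CPU-only setups rather than requiring CUDA specifically
class ToyAcceleratorTest(TestCasePlus):
    def test_device_count_and_bf16(self):
        # backend_device_count(torch_device) replaces the CUDA-only
        # get_gpu_count(): it counts devices for whichever backend
        # torch_device resolves to (cuda, xpu, npu, ...)
        num_devices = min(2, backend_device_count(torch_device))
        self.assertGreaterEqual(num_devices, 1)

        # bf16 support is now queried per device rather than assuming a GPU
        if not is_torch_bf16_available_on_device(torch_device):
            self.skipTest("test requires bfloat16 hardware support")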