diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 9daad85b0..2352cf522 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -38,17 +38,18 @@ from transformers.testing_utils import (
     CaptureStderr,
     LoggingLevel,
     TestCasePlus,
+    backend_device_count,
     execute_subprocess_async,
-    get_gpu_count,
     mockenv_context,
     require_deepspeed,
     require_optuna,
-    require_torch_gpu,
-    require_torch_multi_gpu,
+    require_torch_accelerator,
+    require_torch_multi_accelerator,
     slow,
+    torch_device,
 )
 from transformers.trainer_utils import get_last_checkpoint, set_seed
-from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_gpu_available
+from transformers.utils import SAFE_WEIGHTS_NAME, is_torch_bf16_available_on_device
 
 
 if is_torch_available():
@@ -125,7 +126,7 @@ def get_launcher(distributed=False):
     #    - it won't be able to handle that
     # 2. for now testing with just 2 gpus max (since some quality tests may give different
     #    results with mode gpus because we use very little data)
-    num_gpus = min(2, get_gpu_count()) if distributed else 1
+    num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1
     master_port = get_master_port(real_launcher=True)
     return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
 
@@ -145,7 +146,7 @@ optims = [HF_OPTIM, DS_OPTIM]
 schedulers = [HF_SCHEDULER, DS_SCHEDULER]
 
 stages = [ZERO2, ZERO3]
-if is_torch_bf16_gpu_available():
+if is_torch_bf16_available_on_device(torch_device):
     dtypes = [FP16, BF16]
 else:
     dtypes = [FP16]
@@ -165,7 +166,7 @@ params_with_optims_and_schedulers = list(itertools.product(stages, dtypes, optim
 
 
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     """
     Testing non-Trainer DeepSpeed integration
@@ -273,7 +274,7 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
 
 
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, TrainerIntegrationCommon):
     """
 
@@ -875,7 +876,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
 
 @slow
 @require_deepspeed
-@require_torch_gpu
+@require_torch_accelerator
 class TestDeepSpeedWithLauncher(TestCasePlus):
     """This class is for testing via an external script - can do multiple gpus"""
 
@@ -896,7 +897,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
     #
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_basic_distributed(self, stage, dtype):
         self.run_and_check(stage=stage, dtype=dtype, distributed=True)
 
@@ -927,7 +928,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         )
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_fp32_distributed(self, stage, dtype):
         # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
         # therefore no quality checks, just basic completion checks are done
@@ -968,9 +969,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
 
     @parameterized.expand(["bf16", "fp16", "fp32"])
-    @require_torch_multi_gpu
+    @require_torch_multi_accelerator
     def test_inference(self, dtype):
-        if dtype == "bf16" and not is_torch_bf16_gpu_available():
+        if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
             self.skipTest("test requires bfloat16 hardware support")
 
         # this is just inference, so no optimizer should be loaded
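
For context on the pattern this diff applies throughout the file: each CUDA-only helper (get_gpu_count, require_torch_gpu, require_torch_multi_gpu, is_torch_bf16_gpu_available) is swapped for a device-agnostic counterpart parameterized by torch_device. Below is a minimal sketch of how the new helpers compose in a standalone test; the ToyAcceleratorTest class and its assertions are illustrative, not part of the diff, and the exact set of backends covered by require_torch_accelerator depends on the installed transformers version.

# A minimal sketch of the device-agnostic helpers this diff adopts.
# The imports are the real transformers.testing_utils / transformers.utils
# names used in the diff; the test class itself is hypothetical.
from transformers.testing_utils import (
    TestCasePlus,
    backend_device_count,
    require_torch_accelerator,
    torch_device,
)
from transformers.utils import is_torch_bf16_available_on_device


@require_torch_accelerator  # skips on CPU-only setups rather than requiring CUDA specifically
class ToyAcceleratorTest(TestCasePlus):
    def test_device_count_and_bf16(self):
        # backend_device_count(torch_device) replaces the CUDA-only
        # get_gpu_count(): it counts devices for whichever backend
        # torch_device resolves to (cuda, xpu, npu, ...)
        num_devices = min(2, backend_device_count(torch_device))
        self.assertGreaterEqual(num_devices, 1)

        # bf16 support is now queried per device rather than assuming a GPU
        if not is_torch_bf16_available_on_device(torch_device):
            self.skipTest("test requires bfloat16 hardware support")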