diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst index cdc796c01..b0401750f 100644 --- a/docs/source/main_classes/trainer.rst +++ b/docs/source/main_classes/trainer.rst @@ -1507,6 +1507,35 @@ and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_ + +fp32 Precision +======================================================================================================================= + +DeepSpeed supports both full fp32 and fp16 mixed precision. + +Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you +will want to not use it is when the model you're using doesn't behave well under this training mode. Typically this +happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained +models). Such models may overflow or underflow leading to ``NaN`` loss. If this is your case then you will want to use +the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with: + +.. code-block:: json + + { + "fp16": { + "enabled": false + } + } + +If you're using an Ampere-architecture based GPU, PyTorch version 1.7 and higher will automatically switch to using +the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and +benchmarks, please see `TensorFloat-32(TF32) on Ampere devices +<https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices>`__. The document includes +instructions on how to disable this automatic conversion if for some reason you prefer not to use it. + + + + Automatic Mixed Precision ======================================================================================================================= @@ -1532,11 +1561,6 @@ and the :class:`~transformers.Trainer` will automatically enable or disable it b This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed. -.. 
note:: - - At the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it will be - always set to ``true``. - You can also enable/disable this mode explicitly: .. code-block:: json @@ -1790,6 +1814,24 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder. +Troubleshooting +======================================================================================================================= + +* ``deepspeed`` process gets killed at startup without a traceback + +If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried +to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that +process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or +both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe, experiment with +offloading to NVMe if you're running under ZeRO-3. + +Work is being done to enable estimating how much memory is needed for a specific model: `PR +<https://github.com/microsoft/DeepSpeed/pull/965>`__. 
+ + + + + Notes ======================================================================================================================= diff --git a/setup.py b/setup.py index 974d5ca40..0942a76f6 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ _deps = [ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.3.15", + "deepspeed>=0.3.16", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a5c90f86d..811f9d66c 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -7,7 +7,7 @@ deps = { "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.3.15", + "deepspeed": "deepspeed>=0.3.16", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index a2d6743a1..4ab15b9d5 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -374,10 +374,7 @@ class DeepSpeedConfigHF: # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set # any here unless the user did the work config_fp16 = config.get("fp16") - # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and - # merged and a new release is made, delete the next line and uncomment the one after it - _set_if_auto(config_fp16, "enabled", True) - # _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") + _set_if_auto(config_fp16, "enabled", fp16_backend == "amp") # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any # ZeRO features, so probably best to be avoided. 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7b1f477af..66875a028 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -44,7 +44,7 @@ from .file_utils import ( replace_return_docstrings, ) from .generation_utils import GenerationMixin -from .integrations import is_deepspeed_zero3_enabled +from .integrations import deepspeed_config, is_deepspeed_zero3_enabled from .utils import logging @@ -1124,10 +1124,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") # this immediately partitions the model across all gpus, to avoid the overhead in time # and memory copying it on CPU or each GPU first - - # XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config - # with deepspeed.zero.Init(param_dict=deepspeed_config()): - with deepspeed.zero.Init(): + with deepspeed.zero.Init(config=deepspeed_config()): model = cls(config, *model_args, **model_kwargs) else: model = cls(config, *model_args, **model_kwargs) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 52f9bd72f..0c829e593 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -48,6 +48,7 @@ with ExtendSysPath(f"{bindir}/.."): set_seed(42) MBART_TINY = "sshleifer/tiny-mbart" T5_SMALL = "t5-small" +T5_TINY = "patrickvonplaten/t5-tiny-random" def load_json(path): @@ -108,25 +109,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) - self.ds_config_file = {} - self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json" - self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json" + self.ds_config_file = dict( + zero2=f"{self.test_file_dir_str}/ds_config_zero2.json", + 
zero3=f"{self.test_file_dir_str}/ds_config_zero3.json", + ) # use self.get_config_dict(stage) to use these to ensure the original is not modified - self.ds_config_dict = {} with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f: - self.ds_config_dict[ZERO2] = json.load(f) + config_zero2 = json.load(f) + # by default use fp16 + config_zero2["fp16"]["enabled"] = True with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f: - self.ds_config_dict[ZERO3] = json.load(f) - - def get_config_dict(self, stage): - """As the tests modify the dict, always make a copy""" - config = deepcopy(self.ds_config_dict[stage]) - if stage == ZERO3: + config_zero3 = json.load(f) + # by default use fp16 + config_zero3["fp16"]["enabled"] = True # This setting slows things down, so don't enable it by default unless needed by a test. # It's in the file as a demo for users since we want everything to work out of the box even if slower. - config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False - return config + config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False + self.ds_config_dict = dict( + zero2=config_zero2, + zero3=config_zero3, + ) + + def get_config_dict(self, stage): + # As some tests modify the dict, always make a copy + return deepcopy(self.ds_config_dict[stage]) # --- These tests are enough to run on one of zero stages --- # @@ -192,24 +199,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): # --- These tests need to run on both zero stages --- # - @parameterized.expand(stages) - def test_fp32(self, stage): - ds_config_dict = self.get_config_dict(stage) - ds_config_dict["fp16"]["enabled"] = False # force non-fp16 mode - - # XXX: do we go via from_pretrained in zero 3 here? 
need to test zero.Init(dtype=torch.float) - - # XXX: rewrite this test once fp32 is supported by DeepSpeed - with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict) - with self.assertRaises(Exception) as context: - trainer.train() - self.assertIn( - "ZeRO is only supported if fp16 is enabled", - str(context.exception), - f"got exception: {context.exception}", - ) - @parameterized.expand(stages) def test_hf_optimizer_with_offload(self, stage): # must not allow non-DS optimizer when using ZERO-offload @@ -239,7 +228,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger. with mockenv_context(**self.dist_env_1_gpu): - trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage]) + trainer = get_regression_trainer(local_rank=0, deepspeed=self.get_config_dict(stage)) with CaptureLogger(deepspeed_logger) as cs: trainer.train() self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none") @@ -259,7 +248,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): b=b, local_rank=0, train_len=8, - deepspeed=self.ds_config_file[stage], + deepspeed=self.get_config_dict(stage), per_device_train_batch_size=8, logging_steps=1, ) @@ -267,7 +256,11 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): post_train_a = trainer.model.a.item() # XXX: for some reason the following check fails with zero3 - not a broken but a - # different qualitative outcome - need to investigate at some point + # different qualitative outcome - as if optimizer did run + # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere + # print(trainer.model.a.item()) + # print(trainer.model.b.item()) + # need to 
investigate at some point if stage == ZERO3: return @@ -298,7 +291,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): b=b, local_rank=0, train_len=train_len, - deepspeed=self.ds_config_file[stage], + deepspeed=self.get_config_dict(stage), per_device_train_batch_size=8, gradient_accumulation_steps=1, ) @@ -315,7 +308,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): b=b, local_rank=0, train_len=train_len, - deepspeed=self.ds_config_file[stage], + deepspeed=self.get_config_dict(stage), per_device_train_batch_size=4, gradient_accumulation_steps=2, ) @@ -532,6 +525,35 @@ class TestDeepSpeedWithLauncher(TestCasePlus): do_eval=True, ) + @parameterized.expand(stages) + def test_fp32_non_distributed(self, stage): + # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - + # therefore no quality checks, just basic completion checks are done + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=False, + do_train=True, + do_eval=True, + quality_checks=False, + fp16=False, + ) + + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_fp32_distributed(self, stage): + # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - + # therefore no quality checks, just basic completion checks are done + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=True, + do_train=True, + do_eval=True, + quality_checks=False, + fp16=False, + ) + @parameterized.expand(stages) def test_resume_train_not_from_ds_checkpoint(self, stage): # do normal training and then resume not from the deepspeed checkpoint but explicitly from @@ -550,44 +572,50 @@ class TestDeepSpeedWithLauncher(TestCasePlus): self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) - def do_checks(self, output_dir, do_train=True, do_eval=True): + def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): if do_train: 
train_metrics = load_json(os.path.join(output_dir, "train_results.json")) self.assertIn("train_samples_per_second", train_metrics) - self.assertGreater(train_metrics["train_samples_per_second"], 0.5) + if quality_checks: + self.assertGreater(train_metrics["train_samples_per_second"], 0.5) if do_eval: eval_metrics = load_json(os.path.join(output_dir, "eval_results.json")) self.assertIn("eval_bleu", eval_metrics) - self.assertGreater(eval_metrics["eval_bleu"], 0) + if quality_checks: + self.assertGreater(eval_metrics["eval_bleu"], 1) # XXX: need to do better validation beyond just that the run was successful def run_and_check( self, stage, - eval_steps=10, - distributed=True, - do_train=True, - do_eval=True, - extra_args_str=None, - remove_args_str=None, + model_name: str = T5_SMALL, + eval_steps: int = 10, + distributed: bool = True, + do_train: bool = True, + do_eval: bool = True, + quality_checks: bool = True, + fp16: bool = True, + extra_args_str: str = None, + remove_args_str: str = None, ): # we are doing quality testing so using a small real model output_dir = self.run_trainer( stage=stage, - model_name=T5_SMALL, + model_name=model_name, eval_steps=eval_steps, num_train_epochs=1, do_train=do_train, do_eval=do_eval, distributed=distributed, + fp16=fp16, extra_args_str=extra_args_str, remove_args_str=remove_args_str, ) - self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) + self.do_checks(output_dir, do_train=do_train, do_eval=do_eval, quality_checks=quality_checks) return output_dir @@ -600,6 +628,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): do_train: bool = False, do_eval: bool = True, distributed: bool = True, + fp16: bool = True, extra_args_str: str = None, remove_args_str: str = None, ): @@ -629,6 +658,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus): """.split() args.extend(["--source_prefix", '"translate English to Romanian: "']) + if fp16: + args.extend(["--fp16"]) + actions = 0 if do_train: actions += 1 @@ -636,7 +668,7 @@ class 
TestDeepSpeedWithLauncher(TestCasePlus): f""" --do_train --num_train_epochs {str(num_train_epochs)} - --max_train_samples 100 + --max_train_samples 16 --per_device_train_batch_size 2 --learning_rate 3e-3 """.split() @@ -647,7 +679,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): args.extend( """ --do_eval - --max_eval_samples 100 + --max_eval_samples 16 --per_device_eval_batch_size 2 """.split() ) @@ -688,13 +720,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus): --overwrite_output_dir --do_train --do_eval - --max_train_samples 10 - --max_eval_samples 10 - --per_device_train_batch_size 5 - --per_device_eval_batch_size 5 + --max_train_samples 16 + --max_eval_samples 16 + --per_device_train_batch_size 2 + --per_device_eval_batch_size 2 --num_train_epochs 1 --warmup_steps 8 - --block_size 128 + --block_size 64 + --fp16 --report_to none """.split()