diff --git a/setup.py b/setup.py index 839868fa4..dd5401fb7 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ _deps = [ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.5.7", + "deepspeed>=0.5.9", "fairscale>0.3", "faiss-cpu", "fastapi", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 20f8e966e..b1e08fcd6 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -8,7 +8,7 @@ deps = { "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.5.7", + "deepspeed": "deepspeed>=0.5.9", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 635f4a24d..c587e4767 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2054,7 +2054,12 @@ class Trainer: # now save the real model if stage3_gather_fp16_weights_on_model_save=True # if false it will not be saved. # This must be called on all ranks - self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME) + if not self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME): + logger.warning( + "deepspeed.save_fp16_model didn't save the model, since stage3_gather_fp16_weights_on_model_save=false. " + "Saving the full checkpoint instead, use zero_to_fp32.py to recover weights" + ) + self.deepspeed.save_checkpoint(output_dir) elif self.args.should_save: self._save(output_dir)