[deepspeed] saving checkpoint fallback when fp16 weights aren't saved (#14948)

* [deepspeed] saving checkpoint fallback when fp16 weights aren't saved

* Bump required deepspeed version to match usage when saving checkpoints

* update version

Co-authored-by: Mihai Balint <balint.mihai@gmail.com>
Stas Bekman 2022-01-28 11:05:47 -08:00 committed by GitHub
parent d25e25ee2b
commit 297602c7f4
3 changed files with 8 additions and 3 deletions

setup.py
@@ -98,7 +98,7 @@ _deps = [
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
-    "deepspeed>=0.5.7",
+    "deepspeed>=0.5.9",
     "fairscale>0.3",
     "faiss-cpu",
     "fastapi",

src/transformers/dependency_versions_table.py
@@ -8,7 +8,7 @@ deps = {
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.5.7",
+    "deepspeed": "deepspeed>=0.5.9",
     "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
     "fastapi": "fastapi",

src/transformers/trainer.py
@@ -2054,7 +2054,12 @@ class Trainer:
             # now save the real model if stage3_gather_fp16_weights_on_model_save=True
             # if false it will not be saved.
             # This must be called on all ranks
-            self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME)
+            if not self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME):
+                logger.warning(
+                    "deepspeed.save_fp16_model didn't save the model, since stage3_gather_fp16_weights_on_model_save=false. "
+                    "Saving the full checkpoint instead, use zero_to_fp32.py to recover weights"
+                )
+                self.deepspeed.save_checkpoint(output_dir)
         elif self.args.should_save:
             self._save(output_dir)
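
For context on the fallback path: with ZeRO stage 3 and stage3_gather_fp16_weights_on_model_save=false, save_fp16_model() returns False because the fp16 weights are never gathered onto one rank, so save_checkpoint() writes the sharded ZeRO partitions instead, and DeepSpeed places a zero_to_fp32.py script alongside the checkpoint for offline recovery. A minimal recovery sketch using DeepSpeed's public helper (the directory and output file names below are hypothetical):

import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

checkpoint_dir = "output_dir"  # hypothetical: the same dir passed to save_checkpoint()

# consolidate the sharded ZeRO partitions into a single fp32 state dict
# (runs on CPU, no distributed setup needed)
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
torch.save(state_dict, "pytorch_model.bin")

The same consolidation works from the command line via the script DeepSpeed copies next to the checkpoint, e.g. python zero_to_fp32.py . pytorch_model.bin run from inside the checkpoint folder.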