diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index cdc796c01..b0401750f 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -1507,6 +1507,35 @@ and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_
 
 
 
+
+fp32 Precision
+=======================================================================================================================
+
+Deepspeed supports the full fp32 and the fp16 mixed precision.
+
+Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you
+will want to not use it is when the model you're using doesn't behave well under this training mode. Typically this
+happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained
+models). Such models may overflow or underflow leading to ``NaN`` loss. If this is your case then you will want to use
+the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": "false",
+        }
+    }
+
+If you're using the Ampere-architecture based GPU, pytorch version 1.7 and higher will automatically switch to using
+the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and
+benchmarks, please, see `TensorFloat-32(TF32) on Ampere devices
+<https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices>`__. The document includes
+instructions on how to disable this automatic conversion if for some reason you prefer not to use it.
+
+
+
+
 Automatic Mixed Precision
 =======================================================================================================================
 
@@ -1532,11 +1561,6 @@ and the :class:`~transformers.Trainer` will automatically enable or disable it b
 
 This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed.
 
-.. note::
-
-   At the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it will be
-   always set to ``true``.
-
 You can also enable/disable this mode explicitly:
 
 .. code-block:: json
@@ -1790,6 +1814,24 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i
 larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.
 
 
+Troubleshooting
+=======================================================================================================================
+
+* ``deepspeed`` process gets killed at startup without a traceback
+
+If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried
+to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that
+process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or
+both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe, experiment with
+offloading to NVMe if you're running under ZeRO-3.
+
+Work is being done to enable estimating how much memory is needed for a specific model: `PR
+<https://github.com/microsoft/DeepSpeed/pull/965>`__.
+
+
+
+
+
 
 Notes
 =======================================================================================================================
diff --git a/setup.py b/setup.py
index 974d5ca40..0942a76f6 100644
--- a/setup.py
+++ b/setup.py
@@ -90,7 +90,7 @@ _deps = [
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
-    "deepspeed>=0.3.15",
+    "deepspeed>=0.3.16",
     "docutils==0.16.0",
     "fairscale>0.3",
     "faiss-cpu",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index a5c90f86d..811f9d66c 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -7,7 +7,7 @@ deps = {
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.3.15",
+    "deepspeed": "deepspeed>=0.3.16",
     "docutils": "docutils==0.16.0",
     "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index a2d6743a1..4ab15b9d5 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -374,10 +374,7 @@ class DeepSpeedConfigHF:
         # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
         # any here unless the user did the work
         config_fp16 = config.get("fp16")
-        # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and
-        # merged and a new release is made, delete the next line and uncomment the one after it
-        _set_if_auto(config_fp16, "enabled", True)
-        # _set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
+        _set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
 
         # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
         # ZeRO features, so probably best to be avoided.
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 7b1f477af..66875a028 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -44,7 +44,7 @@ from .file_utils import (
     replace_return_docstrings,
 )
 from .generation_utils import GenerationMixin
-from .integrations import is_deepspeed_zero3_enabled
+from .integrations import deepspeed_config, is_deepspeed_zero3_enabled
 from .utils import logging
 
 
@@ -1124,10 +1124,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
             # this immediately partitions the model across all gpus, to avoid the overhead in time
             # and memory copying it on CPU or each GPU first
-
-            # XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config
-            # with deepspeed.zero.Init(param_dict=deepspeed_config()):
-            with deepspeed.zero.Init():
+            with deepspeed.zero.Init(config=deepspeed_config()):
                 model = cls(config, *model_args, **model_kwargs)
         else:
             model = cls(config, *model_args, **model_kwargs)
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 52f9bd72f..0c829e593 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -48,6 +48,7 @@ with ExtendSysPath(f"{bindir}/.."):
 set_seed(42)
 MBART_TINY = "sshleifer/tiny-mbart"
 T5_SMALL = "t5-small"
+T5_TINY = "patrickvonplaten/t5-tiny-random"
 
 
 def load_json(path):
@@ -108,25 +109,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
             MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )
 
-        self.ds_config_file = {}
-        self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json"
-        self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json"
+        self.ds_config_file = dict(
+            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
+            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
+        )
 
         # use self.get_config_dict(stage) to use these to ensure the original is not modified
-        self.ds_config_dict = {}
         with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
-            self.ds_config_dict[ZERO2] = json.load(f)
+            config_zero2 = json.load(f)
+            # by default use fp16
+            config_zero2["fp16"]["enabled"] = True
         with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
-            self.ds_config_dict[ZERO3] = json.load(f)
-
-    def get_config_dict(self, stage):
-        """As the tests modify the dict, always make a copy"""
-        config = deepcopy(self.ds_config_dict[stage])
-        if stage == ZERO3:
+            config_zero3 = json.load(f)
+            # by default use fp16
+            config_zero3["fp16"]["enabled"] = True
             # This setting slows things down, so don't enable it by default unless needed by a test.
             # It's in the file as a demo for users since we want everything to work out of the box even if slower.
-            config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
-        return config
+            config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
+        self.ds_config_dict = dict(
+            zero2=config_zero2,
+            zero3=config_zero3,
+        )
+
+    def get_config_dict(self, stage):
+        # As some tests modify the dict, always make a copy
+        return deepcopy(self.ds_config_dict[stage])
 
     # --- These tests are enough to run on one of zero stages --- #
 
@@ -192,24 +199,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
 
     # --- These tests need to run on both zero stages --- #
 
-    @parameterized.expand(stages)
-    def test_fp32(self, stage):
-        ds_config_dict = self.get_config_dict(stage)
-        ds_config_dict["fp16"]["enabled"] = False  # force non-fp16 mode
-
-        # XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)
-
-        # XXX: rewrite this test once fp32 is supported by DeepSpeed
-        with mockenv_context(**self.dist_env_1_gpu):
-            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
-                trainer.train()
-            self.assertIn(
-                "ZeRO is only supported if fp16 is enabled",
-                str(context.exception),
-                f"got exception: {context.exception}",
-            )
-
     @parameterized.expand(stages)
     def test_hf_optimizer_with_offload(self, stage):
         # must not allow non-DS optimizer when using ZERO-offload
@@ -239,7 +228,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
+            trainer = get_regression_trainer(local_rank=0, deepspeed=self.get_config_dict(stage))
             with CaptureLogger(deepspeed_logger) as cs:
                 trainer.train()
             self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
@@ -259,7 +248,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                 b=b,
                 local_rank=0,
                 train_len=8,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                 per_device_train_batch_size=8,
                 logging_steps=1,
             )
@@ -267,7 +256,11 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
             post_train_a = trainer.model.a.item()
 
             # XXX: for some reason the following check fails with zero3 - not a broken but a
-            # different qualitative outcome - need to investigate at some point
+            # different qualitative outcome - as if optimizer did run
+            # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere
+            # print(trainer.model.a.item())
+            # print(trainer.model.b.item())
+            # need to investigate at some point
             if stage == ZERO3:
                 return
 
@@ -298,7 +291,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                 b=b,
                 local_rank=0,
                 train_len=train_len,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                 per_device_train_batch_size=8,
                 gradient_accumulation_steps=1,
             )
@@ -315,7 +308,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                 b=b,
                 local_rank=0,
                 train_len=train_len,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                 per_device_train_batch_size=4,
                 gradient_accumulation_steps=2,
             )
@@ -532,6 +525,35 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
             do_eval=True,
         )
 
+    @parameterized.expand(stages)
+    def test_fp32_non_distributed(self, stage):
+        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
+        # therefore no quality checks, just basic completion checks are done
+        self.run_and_check(
+            stage=stage,
+            model_name=T5_TINY,
+            distributed=False,
+            do_train=True,
+            do_eval=True,
+            quality_checks=False,
+            fp16=False,
+        )
+
+    @require_torch_multi_gpu
+    @parameterized.expand(stages)
+    def test_fp32_distributed(self, stage):
+        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
+        # therefore no quality checks, just basic completion checks are done
+        self.run_and_check(
+            stage=stage,
+            model_name=T5_TINY,
+            distributed=True,
+            do_train=True,
+            do_eval=True,
+            quality_checks=False,
+            fp16=False,
+        )
+
     @parameterized.expand(stages)
     def test_resume_train_not_from_ds_checkpoint(self, stage):
         # do normal training and then resume not from the deepspeed checkpoint but explicitly from
@@ -550,44 +572,50 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
         self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
 
-    def do_checks(self, output_dir, do_train=True, do_eval=True):
+    def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True):
 
         if do_train:
             train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
             self.assertIn("train_samples_per_second", train_metrics)
-            self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
+            if quality_checks:
+                self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
 
         if do_eval:
             eval_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
             self.assertIn("eval_bleu", eval_metrics)
-            self.assertGreater(eval_metrics["eval_bleu"], 0)
+            if quality_checks:
+                self.assertGreater(eval_metrics["eval_bleu"], 1)
 
     # XXX: need to do better validation beyond just that the run was successful
     def run_and_check(
         self,
         stage,
-        eval_steps=10,
-        distributed=True,
-        do_train=True,
-        do_eval=True,
-        extra_args_str=None,
-        remove_args_str=None,
+        model_name: str = T5_SMALL,
+        eval_steps: int = 10,
+        distributed: bool = True,
+        do_train: bool = True,
+        do_eval: bool = True,
+        quality_checks: bool = True,
+        fp16: bool = True,
+        extra_args_str: str = None,
+        remove_args_str: str = None,
     ):
 
         # we are doing quality testing so using a small real model
         output_dir = self.run_trainer(
             stage=stage,
-            model_name=T5_SMALL,
+            model_name=model_name,
             eval_steps=eval_steps,
             num_train_epochs=1,
             do_train=do_train,
             do_eval=do_eval,
             distributed=distributed,
+            fp16=fp16,
             extra_args_str=extra_args_str,
             remove_args_str=remove_args_str,
         )
 
-        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
+        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval, quality_checks=quality_checks)
 
         return output_dir
 
@@ -600,6 +628,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         do_train: bool = False,
         do_eval: bool = True,
         distributed: bool = True,
+        fp16: bool = True,
         extra_args_str: str = None,
         remove_args_str: str = None,
     ):
@@ -629,6 +658,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         """.split()
         args.extend(["--source_prefix", '"translate English to Romanian: "'])
 
+        if fp16:
+            args.extend(["--fp16"])
+
         actions = 0
         if do_train:
             actions += 1
@@ -636,7 +668,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
                 f"""
             --do_train
             --num_train_epochs {str(num_train_epochs)}
-            --max_train_samples 100
+            --max_train_samples 16
             --per_device_train_batch_size 2
             --learning_rate 3e-3
             """.split()
@@ -647,7 +679,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
             args.extend(
                 """
             --do_eval
-            --max_eval_samples 100
+            --max_eval_samples 16
             --per_device_eval_batch_size 2
             """.split()
             )
@@ -688,13 +720,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
             --overwrite_output_dir
             --do_train
             --do_eval
-            --max_train_samples 10
-            --max_eval_samples 10
-            --per_device_train_batch_size 5
-            --per_device_eval_batch_size 5
+            --max_train_samples 16
+            --max_eval_samples 16
+            --per_device_train_batch_size 2
+            --per_device_eval_batch_size 2
             --num_train_epochs 1
             --warmup_steps 8
-            --block_size 128
+            --block_size 64
+            --fp16
             --report_to none
             """.split()