diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 8eaa00bc7..95a803601 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -482,6 +482,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                 max_grad_norm=max_grad_norm,
                 adam_beta1=adam_beta1,
                 adam_beta2=adam_beta2,
+                output_dir=self.get_auto_remove_tmp_dir(),
             )
             with self.assertRaises(Exception) as context:
                 trainer.train()
@@ -506,7 +507,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
             del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
             ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
             ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
+                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            )
             trainer.train()
         new_a = trainer.model.a.item()
         self.assertNotEqual(new_a, a)
@@ -518,7 +521,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
             del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
             ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
             ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
+                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            )
             trainer.train()
         new_a = trainer.model.a.item()
         self.assertNotEqual(new_a, a)
@@ -530,7 +535,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
             del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
             ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
             ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
+                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            )
             trainer.train()
         new_a = trainer.model.a.item()
         self.assertNotEqual(new_a, a)
@@ -546,7 +553,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
             ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
             ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
             ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
-            trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
+            trainer = get_regression_trainer(
+                local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
+            )
             with CaptureLogger(deepspeed_logger) as cl:
                 trainer.train()
             self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
@@ -567,6 +576,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                 fp16=True,
                 model_init=model_init,
                 deepspeed=ds_config_zero3_dict,
+                output_dir=self.get_auto_remove_tmp_dir(),
             )
 
             n_trials = 3
@@ -588,7 +598,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         ds_config_dict["zero_force_ds_cpu_optimizer"] = False  # offload is not efficient w/o CPUAdam
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:
@@ -604,7 +614,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
         # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
+            kwargs = {
+                "local_rank": 0,
+                "deepspeed": self.get_config_dict(stage),
+                "output_dir": self.get_auto_remove_tmp_dir(),
+            }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
 
@@ -630,6 +644,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                 "deepspeed": self.get_config_dict(stage),
                 "per_device_train_batch_size": 8,
                 "logging_steps": 1,
+                "output_dir": self.get_auto_remove_tmp_dir(),
             }
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
@@ -673,6 +688,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
             "local_rank": 0,
             "train_len": train_len,
             "deepspeed": self.get_config_dict(stage),
+            "output_dir": self.get_auto_remove_tmp_dir(),
         }
         kwargs[dtype] = True
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 145f32599..1703cb400 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1222,87 +1222,85 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         train_dataset = RegressionDataset()
         eval_dataset = RegressionDataset()
         model = RegressionDictModel()
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(tmp_dir, report_to="none")
-            trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
-            trainer.train()
-            _ = trainer.evaluate()
-            _ = trainer.predict(eval_dataset)
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
+        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
+        trainer.train()
+        _ = trainer.evaluate()
+        _ = trainer.predict(eval_dataset)
 
     def test_evaluation_with_keys_to_drop(self):
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)
         x = torch.randint(0, 100, (128,))
         eval_dataset = RepeatDataset(x)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(tmp_dir, report_to="none")
-            trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
-            # By default the past_key_values are removed
-            result = trainer.predict(eval_dataset)
-            self.assertTrue(isinstance(result.predictions, np.ndarray))
-            # We can still get them by setting ignore_keys to []
-            result = trainer.predict(eval_dataset, ignore_keys=[])
-            self.assertTrue(isinstance(result.predictions, tuple))
-            self.assertEqual(len(result.predictions), 2)
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
+        trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
+        # By default the past_key_values are removed
+        result = trainer.predict(eval_dataset)
+        self.assertTrue(isinstance(result.predictions, np.ndarray))
+        # We can still get them by setting ignore_keys to []
+        result = trainer.predict(eval_dataset, ignore_keys=[])
+        self.assertTrue(isinstance(result.predictions, tuple))
+        self.assertEqual(len(result.predictions), 2)
 
     def test_training_arguments_are_left_untouched(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            trainer = get_regression_trainer(output_dir=tmp_dir)
-            trainer.train()
-            args = TrainingArguments(tmp_dir, report_to=[])
-            dict1, dict2 = args.to_dict(), trainer.args.to_dict()
-            for key in dict1.keys():
-                # Logging dir can be slightly different as they default to something with the time.
-                if key != "logging_dir":
-                    self.assertEqual(dict1[key], dict2[key])
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(output_dir=tmp_dir)
+        trainer.train()
+        args = TrainingArguments(tmp_dir, report_to=[])
+        dict1, dict2 = args.to_dict(), trainer.args.to_dict()
+        for key in dict1.keys():
+            # Logging dir can be slightly different as they default to something with the time.
+            if key != "logging_dir":
+                self.assertEqual(dict1[key], dict2[key])
 
     def test_number_of_steps_in_training(self):
         # Regular training has n_epochs * len(train_dl) steps
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
-            train_output = trainer.train()
-            self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
+        train_output = trainer.train()
+        self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
 
-            # Check passing num_train_epochs works (and a float version too):
-            trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=tmp_dir)
-            train_output = trainer.train()
-            self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size))
+        # Check passing num_train_epochs works (and a float version too):
+        trainer = get_regression_trainer(learning_rate=0.1, num_train_epochs=1.5, output_dir=tmp_dir)
+        train_output = trainer.train()
+        self.assertEqual(train_output.global_step, int(1.5 * 64 / self.batch_size))
 
-            # If we pass a max_steps, num_train_epochs is ignored
-            trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=tmp_dir)
-            train_output = trainer.train()
-            self.assertEqual(train_output.global_step, 10)
+        # If we pass a max_steps, num_train_epochs is ignored
+        trainer = get_regression_trainer(learning_rate=0.1, max_steps=10, output_dir=tmp_dir)
+        train_output = trainer.train()
+        self.assertEqual(train_output.global_step, 10)
 
     @require_torch_bf16
     @require_intel_extension_for_pytorch
     def test_number_of_steps_in_training_with_ipex(self):
         for mix_bf16 in [True, False]:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                # Regular training has n_epochs * len(train_dl) steps
-                trainer = get_regression_trainer(
-                    learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
-                )
-                train_output = trainer.train()
-                self.assertEqual(train_output.global_step, self.n_epochs * 64 / trainer.args.train_batch_size)
+            tmp_dir = self.get_auto_remove_tmp_dir()
+            # Regular training has n_epochs * len(train_dl) steps
+            trainer = get_regression_trainer(
+                learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
+            )
+            train_output = trainer.train()
+            self.assertEqual(train_output.global_step, self.n_epochs * 64 / trainer.args.train_batch_size)
 
-                # Check passing num_train_epochs works (and a float version too):
-                trainer = get_regression_trainer(
-                    learning_rate=0.1,
-                    num_train_epochs=1.5,
-                    use_ipex=True,
-                    bf16=mix_bf16,
-                    use_cpu=True,
-                    output_dir=tmp_dir,
-                )
-                train_output = trainer.train()
-                self.assertEqual(train_output.global_step, int(1.5 * 64 / trainer.args.train_batch_size))
+            # Check passing num_train_epochs works (and a float version too):
+            trainer = get_regression_trainer(
+                learning_rate=0.1,
+                num_train_epochs=1.5,
+                use_ipex=True,
+                bf16=mix_bf16,
+                use_cpu=True,
+                output_dir=tmp_dir,
+            )
+            train_output = trainer.train()
+            self.assertEqual(train_output.global_step, int(1.5 * 64 / trainer.args.train_batch_size))
 
-                # If we pass a max_steps, num_train_epochs is ignored
-                trainer = get_regression_trainer(
-                    learning_rate=0.1, max_steps=10, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
-                )
-                train_output = trainer.train()
-                self.assertEqual(train_output.global_step, 10)
+            # If we pass a max_steps, num_train_epochs is ignored
+            trainer = get_regression_trainer(
+                learning_rate=0.1, max_steps=10, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
+            )
+            train_output = trainer.train()
+            self.assertEqual(train_output.global_step, 10)
 
     def test_torch_compile_loss_func_compatibility(self):
         config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
@@ -1311,15 +1309,14 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir,
-                per_device_train_batch_size=2,
-                torch_compile=True,
-                max_steps=1,  # compile happens on the first step
-            )
-            trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset)  # noqa
-            trainer.train()
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            per_device_train_batch_size=2,
+            torch_compile=True,
+            max_steps=1,  # compile happens on the first step
+        )
+        trainer = Trainer(model=tiny_llama, args=args, train_dataset=train_dataset)  # noqa
+        trainer.train()
 
     @require_peft
     @require_bitsandbytes
@@ -1348,14 +1345,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir,
-                learning_rate=1e-9,
-                logging_steps=5,
-            )
-            with self.assertRaises(ValueError):
-                _ = Trainer(tiny_model, args, train_dataset=train_dataset)  # noqa
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+        )
+        with self.assertRaises(ValueError):
+            _ = Trainer(tiny_model, args, train_dataset=train_dataset)  # noqa
 
     @require_peft
     def test_multiple_peft_adapters(self):
@@ -1387,32 +1383,32 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
 
         tokenizer.pad_token = tokenizer.eos_token
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            args = TrainingArguments(
-                tmpdir,
-                per_device_train_batch_size=1,
-                learning_rate=1e-9,
-                save_steps=5,
-                logging_steps=5,
-                max_steps=10,
-                use_cpu=True,
-            )
-            trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        args = TrainingArguments(
+            tmp_dir,
+            per_device_train_batch_size=1,
+            learning_rate=1e-9,
+            save_steps=5,
+            logging_steps=5,
+            max_steps=10,
+            use_cpu=True,
+        )
+        trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
 
-            trainer.train()
-            parameters = dict(tiny_model.named_parameters())
-            state = dataclasses.asdict(trainer.state)
+        trainer.train()
+        parameters = dict(tiny_model.named_parameters())
+        state = dataclasses.asdict(trainer.state)
 
-            # Reinitialize trainer
-            trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
+        # Reinitialize trainer
+        trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
 
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
 
-            trainer.train(resume_from_checkpoint=checkpoint)
-            parameters1 = dict(tiny_model.named_parameters())
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(parameters, parameters1)
-            self.check_trainer_state_are_the_same(state, state1)
+        trainer.train(resume_from_checkpoint=checkpoint)
+        parameters1 = dict(tiny_model.named_parameters())
+        state1 = dataclasses.asdict(trainer.state)
+        self.assertEqual(parameters, parameters1)
+        self.check_trainer_state_are_the_same(state, state1)
 
     @require_bitsandbytes
     def test_rmsprop_bnb(self):
@@ -1421,15 +1417,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            optim="rmsprop_bnb",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
     @require_bitsandbytes
     def test_ademamix_bnb(self):
@@ -1438,15 +1437,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            optim="ademamix",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
     @require_bitsandbytes
     def test_ademamix_bnb_8bit(self):
@@ -1455,15 +1457,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix_8bit"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            optim="ademamix_8bit",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
     @require_bitsandbytes
     def test_rmsprop_bnb_8bit(self):
@@ -1472,15 +1477,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_8bit"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            optim="rmsprop_bnb_8bit",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
     @require_bitsandbytes
     def test_rmsprop_bnb_32bit(self):
@@ -1488,15 +1496,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         tiny_gpt2 = GPT2LMHeadModel(config)
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_32bit"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        # Trainer without inf/nan filter
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            optim="rmsprop_bnb_32bit",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
     def test_neftune(self):
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
@@ -1505,54 +1516,52 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         train_dataset = RepeatDataset(x)
 
         # Trainer without inf/nan filter
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                logging_nan_inf_filter=False,
-                neftune_noise_alpha=0.4,
-                report_to="none",
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            neftune_noise_alpha=0.4,
+            report_to="none",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            trainer.model = trainer._activate_neftune(trainer.model)
+        trainer.model = trainer._activate_neftune(trainer.model)
 
-            dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)
+        dummy_input = torch.LongTensor([[1, 0, 1]]).to(torch_device)
 
-            emb1 = trainer.model.get_input_embeddings()(dummy_input)
-            emb2 = trainer.model.get_input_embeddings()(dummy_input)
+        emb1 = trainer.model.get_input_embeddings()(dummy_input)
+        emb2 = trainer.model.get_input_embeddings()(dummy_input)
 
-            self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")
+        self.assertFalse(torch.allclose(emb1, emb2), "Neftune noise is not applied!")
 
         # redefine the model
         tiny_gpt2 = GPT2LMHeadModel(config)
         # Trainer without inf/nan filter
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                logging_nan_inf_filter=False,
-                neftune_noise_alpha=0.4,
-                report_to="none",
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            neftune_noise_alpha=0.4,
+            report_to="none",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
 
-            # Check that it trains without errors
-            trainer.train()
+        # Check that it trains without errors
+        trainer.train()
 
-            # Make sure forward pass works fine
-            _ = trainer.model(dummy_input)
-            self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0)
+        # Make sure forward pass works fine
+        _ = trainer.model(dummy_input)
+        self.assertTrue(len(trainer.model.get_input_embeddings()._forward_hooks) == 0)
 
-            trainer.model.eval()
+        trainer.model.eval()
 
-            # Check that we get identical embeddings just in case
-            emb1 = trainer.model.get_input_embeddings()(dummy_input)
-            emb2 = trainer.model.get_input_embeddings()(dummy_input)
+        # Check that we get identical embeddings just in case
+        emb1 = trainer.model.get_input_embeddings()(dummy_input)
+        emb2 = trainer.model.get_input_embeddings()(dummy_input)
 
-            self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!")
+        self.assertTrue(torch.allclose(emb1, emb2), "Neftune noise is still applied!")
 
     def test_logging_inf_nan_filter(self):
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
@@ -1561,29 +1570,35 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         train_dataset = RepeatDataset(x)
 
         # Trainer without inf/nan filter
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
-            trainer.train()
-            log_history_no_filter = trainer.state.log_history
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e9,
+            logging_steps=5,
+            logging_nan_inf_filter=False,
+            report_to="none",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        trainer.train()
+        log_history_no_filter = trainer.state.log_history
 
         # Trainer with inf/nan filter
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none"
-            )
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
-            trainer.train()
-            log_history_filter = trainer.state.log_history
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e9,
+            logging_steps=5,
+            logging_nan_inf_filter=True,
+            report_to="none",
+        )
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
+        trainer.train()
+        log_history_filter = trainer.state.log_history
 
-            def is_any_loss_nan_or_inf(log_history):
-                losses = [l["loss"] for l in log_history[:-1]]
-                return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses)
+        def is_any_loss_nan_or_inf(log_history):
+            losses = [l["loss"] for l in log_history[:-1]]
+            return any(math.isnan(x) for x in losses) or any(math.isinf(x) for x in losses)
 
-            self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter))
-            self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))
+        self.assertTrue(is_any_loss_nan_or_inf(log_history_no_filter))
+        self.assertFalse(is_any_loss_nan_or_inf(log_history_filter))
 
     def test_train_and_eval_dataloaders(self):
         if torch_device == "cuda":
@@ -1591,146 +1606,143 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         else:
             n_gpu = 1
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
-            self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
-            trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
-            self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
+        self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
+        trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
+        self.assertEqual(trainer.get_eval_dataloader().total_batch_size, 16 * n_gpu)
 
-            # Check drop_last works
-            trainer = get_regression_trainer(
-                train_len=66,
-                eval_len=74,
-                learning_rate=0.1,
-                per_device_train_batch_size=16,
-                per_device_eval_batch_size=32,
-                output_dir=tmp_dir,
-            )
-            self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1)
-            self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1)
+        # Check drop_last works
+        trainer = get_regression_trainer(
+            train_len=66,
+            eval_len=74,
+            learning_rate=0.1,
+            per_device_train_batch_size=16,
+            per_device_eval_batch_size=32,
+            output_dir=tmp_dir,
+        )
+        self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu) + 1)
+        self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu) + 1)
 
-            trainer = get_regression_trainer(
-                train_len=66,
-                eval_len=74,
-                learning_rate=0.1,
-                per_device_train_batch_size=16,
-                per_device_eval_batch_size=32,
-                dataloader_drop_last=True,
-                output_dir=tmp_dir,
-            )
-            self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu))
-            self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu))
+        trainer = get_regression_trainer(
+            train_len=66,
+            eval_len=74,
+            learning_rate=0.1,
+            per_device_train_batch_size=16,
+            per_device_eval_batch_size=32,
+            dataloader_drop_last=True,
+            output_dir=tmp_dir,
+        )
+        self.assertEqual(len(trainer.get_train_dataloader()), 66 // (16 * n_gpu))
+        self.assertEqual(len(trainer.get_eval_dataloader()), 74 // (32 * n_gpu))
 
-            # Check passing a new dataset for evaluation works
-            new_eval_dataset = RegressionDataset(length=128)
-            self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu))
+        # Check passing a new dataset for evaluation works
+        new_eval_dataset = RegressionDataset(length=128)
+        self.assertEqual(len(trainer.get_eval_dataloader(new_eval_dataset)), 128 // (32 * n_gpu))
 
     # tests that we do not require dataloader to have a .dataset attribute
     def test_dataloader_without_dataset(self):
         train_dataset = RegressionDataset(length=128)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            trainer = CustomDataloaderTrainer(
-                model=RegressionModel(),
-                train_dataset=train_dataset,
-                eval_dataset=train_dataset,
-                args=TrainingArguments(output_dir=tmp_dir, report_to="none"),
-            )
+        trainer = CustomDataloaderTrainer(
+            model=RegressionModel(),
+            train_dataset=train_dataset,
+            eval_dataset=train_dataset,
+            args=TrainingArguments(output_dir=self.get_auto_remove_tmp_dir(), report_to="none"),
+        )
 
-            trainer.train()
-            trainer.evaluate()
+        trainer.train()
+        trainer.evaluate()
 
     def test_get_eval_dataloader_without_persistent_workers(self):
         train_dataset = RegressionDataset()
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(tmp_dir, report_to="none", dataloader_persistent_workers=False)
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none", dataloader_persistent_workers=False)
 
-            # Single evaluation dataset
-            eval_dataset = RegressionDataset()
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
-            # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
-            trainer.accelerator.prepare = lambda x: x
+        # Single evaluation dataset
+        eval_dataset = RegressionDataset()
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
+        # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
+        trainer.accelerator.prepare = lambda x: x
 
-            default_dataloader = trainer.get_eval_dataloader()
-            dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)
+        default_dataloader = trainer.get_eval_dataloader()
+        dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)
 
-            self.assertEqual(default_dataloader.dataset, eval_dataset)
-            self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
-            self.assertNotEqual(default_dataloader, dataloader_with_dataset)
+        self.assertEqual(default_dataloader.dataset, eval_dataset)
+        self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
+        self.assertNotEqual(default_dataloader, dataloader_with_dataset)
 
-            # Multiple evaluation datasets
-            first_dataset = RegressionDataset()
-            second_dataset = RegressionDataset()
-            trainer = Trainer(
-                tiny_gpt2,
-                args,
-                train_dataset=train_dataset,
-                eval_dataset={"first": first_dataset, "second": second_dataset},
-            )
-            # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
-            trainer.accelerator.prepare = lambda x: x
+        # Multiple evaluation datasets
+        first_dataset = RegressionDataset()
+        second_dataset = RegressionDataset()
+        trainer = Trainer(
+            tiny_gpt2,
+            args,
+            train_dataset=train_dataset,
+            eval_dataset={"first": first_dataset, "second": second_dataset},
+        )
+        # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
+        trainer.accelerator.prepare = lambda x: x
 
-            first_dataloader = trainer.get_eval_dataloader("first")
-            first_dataloader_repeated = trainer.get_eval_dataloader("first")
-            second_dataloader = trainer.get_eval_dataloader("second")
-            second_dataloader_repeated = trainer.get_eval_dataloader("second")
+        first_dataloader = trainer.get_eval_dataloader("first")
+        first_dataloader_repeated = trainer.get_eval_dataloader("first")
+        second_dataloader = trainer.get_eval_dataloader("second")
+        second_dataloader_repeated = trainer.get_eval_dataloader("second")
 
-            self.assertEqual(first_dataset, first_dataloader.dataset)
-            self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
-            self.assertEqual(second_dataset, second_dataloader.dataset)
-            self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
-            self.assertNotEqual(first_dataloader, first_dataloader_repeated)
-            self.assertNotEqual(second_dataloader, second_dataloader_repeated)
+        self.assertEqual(first_dataset, first_dataloader.dataset)
+        self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
+        self.assertEqual(second_dataset, second_dataloader.dataset)
+        self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
+        self.assertNotEqual(first_dataloader, first_dataloader_repeated)
+        self.assertNotEqual(second_dataloader, second_dataloader_repeated)
 
     def test_get_eval_dataloader_with_persistent_workers(self):
         train_dataset = RegressionDataset()
         config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
         tiny_gpt2 = GPT2LMHeadModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            args = TrainingArguments(
-                tmp_dir,
-                report_to="none",
-                dataloader_persistent_workers=True,
-                dataloader_num_workers=2,
-            )
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            report_to="none",
+            dataloader_persistent_workers=True,
+            dataloader_num_workers=2,
+        )
 
-            # Single evaluation dataset
-            eval_dataset = RegressionDataset()
-            trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
-            # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
-            trainer.accelerator.prepare = lambda x: x
+        # Single evaluation dataset
+        eval_dataset = RegressionDataset()
+        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
+        # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
+        trainer.accelerator.prepare = lambda x: x
 
-            default_dataloader = trainer.get_eval_dataloader()
-            dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)
+        default_dataloader = trainer.get_eval_dataloader()
+        dataloader_with_dataset = trainer.get_eval_dataloader(eval_dataset)
 
-            self.assertEqual(default_dataloader.dataset, eval_dataset)
-            self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
-            self.assertEqual(default_dataloader, dataloader_with_dataset)
+        self.assertEqual(default_dataloader.dataset, eval_dataset)
+        self.assertEqual(dataloader_with_dataset.dataset, eval_dataset)
+        self.assertEqual(default_dataloader, dataloader_with_dataset)
 
-            # Multiple evaluation datasets
-            first_dataset = RegressionDataset()
-            second_dataset = RegressionDataset()
-            trainer = Trainer(
-                tiny_gpt2,
-                args,
-                train_dataset=train_dataset,
-                eval_dataset={"first": first_dataset, "second": second_dataset},
-            )
-            # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
-            trainer.accelerator.prepare = lambda x: x
+        # Multiple evaluation datasets
+        first_dataset = RegressionDataset()
+        second_dataset = RegressionDataset()
+        trainer = Trainer(
+            tiny_gpt2,
+            args,
+            train_dataset=train_dataset,
+            eval_dataset={"first": first_dataset, "second": second_dataset},
+        )
+        # Mocking the prepare method to avoid the dataloader changing with each call to get_eval_dataloader
+        trainer.accelerator.prepare = lambda x: x
 
-            first_dataloader = trainer.get_eval_dataloader("first")
-            first_dataloader_repeated = trainer.get_eval_dataloader("first")
-            second_dataloader = trainer.get_eval_dataloader("second")
-            second_dataloader_repeated = trainer.get_eval_dataloader("second")
+        first_dataloader = trainer.get_eval_dataloader("first")
+        first_dataloader_repeated = trainer.get_eval_dataloader("first")
+        second_dataloader = trainer.get_eval_dataloader("second")
+        second_dataloader_repeated = trainer.get_eval_dataloader("second")
 
-            self.assertEqual(first_dataset, first_dataloader.dataset)
-            self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
-            self.assertEqual(second_dataset, second_dataloader.dataset)
-            self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
-            self.assertEqual(first_dataloader, first_dataloader_repeated)
-            self.assertEqual(second_dataloader, second_dataloader_repeated)
+        self.assertEqual(first_dataset, first_dataloader.dataset)
+        self.assertEqual(first_dataloader.dataset, first_dataloader_repeated.dataset)
+        self.assertEqual(second_dataset, second_dataloader.dataset)
+        self.assertEqual(second_dataloader.dataset, second_dataloader_repeated.dataset)
+        self.assertEqual(first_dataloader, first_dataloader_repeated)
+        self.assertEqual(second_dataloader, second_dataloader_repeated)
 
     @require_liger_kernel
     def test_use_liger_kernel_patching(self):
@@ -1747,16 +1759,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
             self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
 
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                args = TrainingArguments(
-                    tmp_dir,
-                    use_liger_kernel=True,
-                )
-                Trainer(tiny_llama, args)
+            args = TrainingArguments(
+                self.get_auto_remove_tmp_dir(),
+                use_liger_kernel=True,
+            )
+            Trainer(tiny_llama, args)
 
-                # Spot check that modeling code and model instance variables are patched
-                self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
-                self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))
+            # Spot check that modeling code and model instance variables are patched
+            self.assertEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
+            self.assertTrue(isinstance(tiny_llama.model.norm, LigerRMSNorm))
 
     @require_liger_kernel
     @require_torch_gpu
@@ -1768,12 +1779,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True)
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_lomo
     @require_torch_gpu
@@ -1786,13 +1798,14 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20)
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with the "lomo" optimizer, capped at 20 steps
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
         for name, param in tiny_llama.named_parameters():
             self.assertFalse(torch.allclose(param, previous_params[name].to(param.device), rtol=1e-12, atol=1e-12))
@@ -1805,40 +1818,38 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="adalomo",
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with the "adalomo" optimizer
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="adalomo",
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_grokadamw
     @require_torch_gpu
-    def test_grokadamw():
+    def test_grokadamw(self):
         config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
         tiny_llama = LlamaForCausalLM(config)
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=2e-5,
-                logging_steps=5,
-                optim="grokadamw",
-                max_steps=20,
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with the "grokadamw" optimizer, capped at 20 steps
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=2e-5,
+            logging_steps=5,
+            optim="grokadamw",
+            max_steps=20,
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_schedulefree
     @require_torch_gpu
@@ -1848,18 +1859,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="schedule_free_adamw",
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with the "schedule_free_adamw" optimizer
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="schedule_free_adamw",
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     def test_galore_matched_modules(self):
         regex_patterns = [r".*.attn.*", r".*.mlp.*"]
@@ -1950,19 +1960,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="galore_adamw",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw" applied to the attn/mlp modules
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="galore_adamw",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_galore_torch
     @require_torch_gpu
@@ -1972,20 +1981,19 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="galore_adamw",
-                optim_args="rank=64, update_proj_gap=100, scale=0.10",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw" plus explicit optim_args (rank, update_proj_gap, scale)
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="galore_adamw",
+            optim_args="rank=64, update_proj_gap=100, scale=0.10",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_galore_torch
     @require_torch_gpu
@@ -1995,19 +2003,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="galore_adamw_layerwise",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw_layerwise" applied to the attn/mlp modules
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="galore_adamw_layerwise",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_galore_torch
     @require_torch_gpu
@@ -2017,20 +2024,19 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="galore_adamw_layerwise",
-                lr_scheduler_type="cosine",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw_layerwise" and a cosine LR scheduler
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="galore_adamw_layerwise",
+            lr_scheduler_type="cosine",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_galore_torch
     @require_torch_gpu
@@ -2040,19 +2046,18 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=1e-9,
-                logging_steps=5,
-                optim="galore_adamw_8bit",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw_8bit" applied to the attn/mlp modules
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=1e-9,
+            logging_steps=5,
+            optim="galore_adamw_8bit",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # Check this works
-            _ = trainer.train()
+        # Check this works
+        _ = trainer.train()
 
     @require_galore_torch
     @require_torch_gpu
@@ -2156,23 +2161,22 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            learning_rate = 1e-9
-            num_steps = 10
+        learning_rate = 1e-9
+        num_steps = 10
 
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                learning_rate=learning_rate,
-                logging_steps=5,
-                optim="galore_adamw",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
-            trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
+        # Trainer configured with "galore_adamw"; only the optimizer and scheduler are created below
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            learning_rate=learning_rate,
+            logging_steps=5,
+            optim="galore_adamw",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
 
-            # reflects displayed lr in trainer
-            self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate])
+        # reflects displayed lr in trainer
+        self.assertEqual(trainer.get_learning_rates(), [learning_rate, learning_rate])
 
     @require_galore_torch
     @require_torch_gpu
@@ -2182,49 +2186,48 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         x = torch.randint(0, 100, (128,))
         train_dataset = RepeatDataset(x)
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            learning_rate = 2e-4
-            num_train_epochs = 2
-            num_warmup_steps = 5
+        learning_rate = 2e-4
+        num_train_epochs = 2
+        num_warmup_steps = 5
 
-            # Trainer without inf/nan filter
-            args = TrainingArguments(
-                tmpdir,
-                num_train_epochs=num_train_epochs,
-                learning_rate=learning_rate,
-                warmup_steps=num_warmup_steps,
-                lr_scheduler_type="cosine",
-                logging_steps=1,
-                optim="galore_adamw",
-                optim_target_modules=[r".*attn.*", r".*mlp.*"],
-            )
-            trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
+        # Trainer configured with "galore_adamw", a cosine LR scheduler with warmup, and per-step logging
+        args = TrainingArguments(
+            self.get_auto_remove_tmp_dir(),
+            num_train_epochs=num_train_epochs,
+            learning_rate=learning_rate,
+            warmup_steps=num_warmup_steps,
+            lr_scheduler_type="cosine",
+            logging_steps=1,
+            optim="galore_adamw",
+            optim_target_modules=[r".*attn.*", r".*mlp.*"],
+        )
+        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
 
-            # creating log history of trainer, results don't matter
-            trainer.train()
-            logs = trainer.state.log_history[1:][:-1]
+        # creating log history of trainer, results don't matter
+        trainer.train()
+        logs = trainer.state.log_history[1:][:-1]
 
-            # reach given learning rate peak and end with 0 lr
-            self.assertTrue(logs[num_warmup_steps - 2]["learning_rate"] == learning_rate)
-            self.assertTrue(logs[-1]["learning_rate"] == 0)
+        # reach given learning rate peak and end with 0 lr
+        self.assertTrue(logs[num_warmup_steps - 2]["learning_rate"] == learning_rate)
+        self.assertTrue(logs[-1]["learning_rate"] == 0)
 
-            # increasing and decreasing pattern of lrs
-            increasing_lrs = [
-                logs[i]["learning_rate"] < logs[i + 1]["learning_rate"]
-                for i in range(len(logs))
-                if i < num_warmup_steps - 2
-            ]
-            decreasing_lrs = [
-                logs[i]["learning_rate"] > logs[i + 1]["learning_rate"]
-                for i in range(len(logs) - 1)
-                if i >= num_warmup_steps - 2
-            ]
+        # increasing and decreasing pattern of lrs
+        increasing_lrs = [
+            logs[i]["learning_rate"] < logs[i + 1]["learning_rate"]
+            for i in range(len(logs))
+            if i < num_warmup_steps - 2
+        ]
+        decreasing_lrs = [
+            logs[i]["learning_rate"] > logs[i + 1]["learning_rate"]
+            for i in range(len(logs) - 1)
+            if i >= num_warmup_steps - 2
+        ]
 
-            self.assertTrue(all(increasing_lrs))
-            self.assertTrue(all(decreasing_lrs))
+        self.assertTrue(all(increasing_lrs))
+        self.assertTrue(all(decreasing_lrs))
 
-            # warm up steps << total steps
-            self.assertTrue(len(decreasing_lrs) > len(increasing_lrs))
+        # warm up steps << total steps
+        self.assertTrue(len(decreasing_lrs) > len(increasing_lrs))
 
     @require_torch_multi_accelerator
     def test_data_is_not_parallelized_when_model_is_parallel(self):
@@ -2707,86 +2710,84 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 self.assertNotIn(log_info_string, cl.out)
 
     def test_save_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
-            trainer.train()
-            self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size))
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5)
+        trainer.train()
+        self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size))
 
         # With a regular model that is not a PreTrainedModel
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False)
-            trainer.train()
-            self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, pretrained=False)
+        trainer.train()
+        self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False)
 
     @require_safetensors
     def test_safe_checkpoints(self):
         for save_safetensors in [True, False]:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors)
-                trainer.train()
-                self.check_saved_checkpoints(
-                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
-                )
+            tmp_dir = self.get_auto_remove_tmp_dir()
+            trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, save_safetensors=save_safetensors)
+            trainer.train()
+            self.check_saved_checkpoints(
+                tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
+            )
 
             # With a regular model that is not a PreTrainedModel
-            with tempfile.TemporaryDirectory() as tmpdir:
-                trainer = get_regression_trainer(
-                    output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
-                )
-                trainer.train()
-                self.check_saved_checkpoints(
-                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
-                )
-
-    def test_load_best_model_with_save(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_dir = self.get_auto_remove_tmp_dir()
             trainer = get_regression_trainer(
-                output_dir=tmpdir,
-                save_steps=5,
-                evaluation_strategy="steps",
-                eval_steps=5,
-                max_steps=9,
+                output_dir=tmp_dir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
             )
             trainer.train()
-            # Check that we have the last known step:
-            assert os.path.exists(
-                os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
-            ), f"Could not find checkpoint-{trainer.state.max_steps}"
-            # And then check the last step
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9"
+            self.check_saved_checkpoints(
+                tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
+            )
+
+    def test_load_best_model_with_save(self):
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(
+            output_dir=tmp_dir,
+            save_steps=5,
+            evaluation_strategy="steps",
+            eval_steps=5,
+            max_steps=9,
+        )
+        trainer.train()
+        # Check that we have the last known step:
+        assert os.path.exists(
+            os.path.join(tmp_dir, f"checkpoint-{trainer.state.max_steps}")
+        ), f"Could not find checkpoint-{trainer.state.max_steps}"
+        # And then check the last step
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-9")), "Could not find checkpoint-9"
 
         # Now test that using a limit works
         # Should result in:
         # - save at step 5 (but is deleted)
         # - save at step 10 (loaded in at the end when `load_best_model=True`)
         # - save at step 11
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(
-                output_dir=tmpdir,
-                save_steps=5,
-                evaluation_strategy="steps",
-                eval_steps=5,
-                load_best_model_at_end=True,
-                save_total_limit=2,
-                max_steps=11,
-            )
-            trainer.train()
-            # Check that we have the last known step:
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11"
-            # And then check the last multiple
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10"
-            # Finally check that we don't have an old one
-            assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(
+            output_dir=tmp_dir,
+            save_steps=5,
+            evaluation_strategy="steps",
+            eval_steps=5,
+            load_best_model_at_end=True,
+            save_total_limit=2,
+            max_steps=11,
+        )
+        trainer.train()
+        # Check that we have the last known step:
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-11")), "Could not find checkpoint-11"
+        # And then check the last multiple
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-10")), "Could not find checkpoint-10"
+        # Finally check that we don't have an old one
+        assert not os.path.exists(os.path.join(tmp_dir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
 
-            # Finally check that the right model was loaded in, checkpoint-10
-            # this goes by the last `eval` step check to do so, so it won't be
-            # the last model *saved*
-            model_state = trainer.model.state_dict()
-            final_model_weights = safetensors.torch.load_file(
-                os.path.join(tmpdir, "checkpoint-10", "model.safetensors")
-            )
-            for k, v in model_state.items():
-                assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
+        # Finally check that the right model was loaded in, checkpoint-10
+        # this goes by the last `eval` step check to do so, so it won't be
+        # the last model *saved*
+        model_state = trainer.model.state_dict()
+        final_model_weights = safetensors.torch.load_file(os.path.join(tmp_dir, "checkpoint-10", "model.safetensors"))
+        for k, v in model_state.items():
+            assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
 
     @require_torch_multi_accelerator
     def test_run_seq2seq_double_train_wrap_once(self):
@@ -2794,13 +2795,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
         # example DataParallel(DataParallel(model))
 
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            trainer = get_regression_trainer(output_dir=tmp_dir)
-            trainer.train()
-            model_wrapped_before = trainer.model_wrapped
-            trainer.train()
-            model_wrapped_after = trainer.model_wrapped
-            self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice")
+        trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
+        trainer.train()
+        model_wrapped_before = trainer.model_wrapped
+        trainer.train()
+        model_wrapped_after = trainer.model_wrapped
+        self.assertIs(model_wrapped_before, model_wrapped_after, "should be not wrapped twice")
 
     @require_torch_up_to_2_accelerators
     def test_can_resume_training(self):
@@ -2808,99 +2808,99 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
         # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
         # won't be the same since the training dataloader is shuffled).
 
-        with tempfile.TemporaryDirectory() as tmpdir:
-            kwargs = {
-                "output_dir": tmpdir,
-                "train_len": 128,
-                "save_steps": 5,
-                "learning_rate": 0.1,
-                "logging_steps": 5,
-            }
-            trainer = get_regression_trainer(**kwargs)
-            trainer.train()
-            (a, b) = trainer.model.a.item(), trainer.model.b.item()
-            state = dataclasses.asdict(trainer.state)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        kwargs = {
+            "output_dir": tmp_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "logging_steps": 5,
+        }
+        trainer = get_regression_trainer(**kwargs)
+        trainer.train()
+        (a, b) = trainer.model.a.item(), trainer.model.b.item()
+        state = dataclasses.asdict(trainer.state)
 
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
 
-            # Reinitialize trainer
-            trainer = get_regression_trainer(**kwargs)
+        # Reinitialize trainer
+        trainer = get_regression_trainer(**kwargs)
 
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
+        trainer.train(resume_from_checkpoint=checkpoint)
+        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+        state1 = dataclasses.asdict(trainer.state)
+        self.assertEqual(a, a1)
+        self.assertEqual(b, b1)
+        self.check_trainer_state_are_the_same(state, state1)
 
-            # Now check with a later checkpoint that it also works when we span over one epoch
-            checkpoint = os.path.join(tmpdir, "checkpoint-15")
+        # Now check with a later checkpoint that it also works when we span over one epoch
+        checkpoint = os.path.join(tmp_dir, "checkpoint-15")
 
-            # Reinitialize trainer and load model
-            trainer = get_regression_trainer(**kwargs)
+        # Reinitialize trainer and load model
+        trainer = get_regression_trainer(**kwargs)
 
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
+        trainer.train(resume_from_checkpoint=checkpoint)
+        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+        state1 = dataclasses.asdict(trainer.state)
+        self.assertEqual(a, a1)
+        self.assertEqual(b, b1)
+        self.check_trainer_state_are_the_same(state, state1)
 
         # With a regular model that is not a PreTrainedModel
-        with tempfile.TemporaryDirectory() as tmpdir:
-            kwargs = {
-                "output_dir": tmpdir,
-                "train_len": 128,
-                "save_steps": 5,
-                "learning_rate": 0.1,
-                "pretrained": False,
-            }
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        kwargs = {
+            "output_dir": tmp_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "pretrained": False,
+        }
 
-            trainer = get_regression_trainer(**kwargs)
-            trainer.train()
-            (a, b) = trainer.model.a.item(), trainer.model.b.item()
-            state = dataclasses.asdict(trainer.state)
+        trainer = get_regression_trainer(**kwargs)
+        trainer.train()
+        (a, b) = trainer.model.a.item(), trainer.model.b.item()
+        state = dataclasses.asdict(trainer.state)
 
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
 
-            # Reinitialize trainer and load model
-            trainer = get_regression_trainer(**kwargs)
+        # Reinitialize trainer and load model
+        trainer = get_regression_trainer(**kwargs)
 
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
+        trainer.train(resume_from_checkpoint=checkpoint)
+        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+        state1 = dataclasses.asdict(trainer.state)
+        self.assertEqual(a, a1)
+        self.assertEqual(b, b1)
+        self.check_trainer_state_are_the_same(state, state1)
 
-            # Now check with a later checkpoint that it also works when we span over one epoch
-            checkpoint = os.path.join(tmpdir, "checkpoint-15")
+        # Now check with a later checkpoint that it also works when we span over one epoch
+        checkpoint = os.path.join(tmp_dir, "checkpoint-15")
 
-            # Reinitialize trainer and load model
-            trainer = get_regression_trainer(**kwargs)
+        # Reinitialize trainer and load model
+        trainer = get_regression_trainer(**kwargs)
 
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
+        trainer.train(resume_from_checkpoint=checkpoint)
+        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+        state1 = dataclasses.asdict(trainer.state)
+        self.assertEqual(a, a1)
+        self.assertEqual(b, b1)
+        self.check_trainer_state_are_the_same(state, state1)
 
         # Now check failures
 
         # 1. fail to find a bogus checkpoint
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(output_dir=tmpdir)
-            with self.assertRaises(Exception) as context:
-                trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
-            self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(output_dir=tmp_dir)
+        with self.assertRaises(Exception) as context:
+            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
+        self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
 
         # 2. fail to find any checkpoint - due a fresh output_dir
-        with tempfile.TemporaryDirectory() as tmpdir:
-            trainer = get_regression_trainer(output_dir=tmpdir)
-            with self.assertRaises(Exception) as context:
-                trainer.train(resume_from_checkpoint=True)
-            self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        trainer = get_regression_trainer(output_dir=tmp_dir)
+        with self.assertRaises(Exception) as context:
+            trainer.train(resume_from_checkpoint=True)
+        self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
 
     @unittest.skip(
         reason="@muellerzr: Fix once Trainer can take an accelerate configuration. Need to set `seedable_sampler=True`."