From 48833071c0f8a7810374e8fe89b58afa7cd8c970 Mon Sep 17 00:00:00 2001
From: Marc Sun
Date: Mon, 9 Dec 2024 15:20:48 +0000
Subject: [PATCH] temporary solution to handle saving a file from the dduf
 format

---
 .../models/t5/tokenization_t5_fast.py         | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 9c150d093..0734cfd64 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -95,6 +95,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space=None,
         **kwargs,
     ):
+        self.dduf_entries = kwargs.get("dduf_entries", None)
         # Add extra_ids to the special token list
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
@@ -128,7 +129,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):

     @property
     def can_save_slow_tokenizer(self) -> bool:
-        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        # TODO: update this. Setting it to True for now
+        # return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        return True

     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
@@ -173,10 +176,12 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
             logger.info(f"Copy vocab file to {out_vocab_file}")
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # copyfile doesn't work with binary content, e.g. when we load the file from an archive
+        elif not os.path.isfile(self.vocab_file):
+            with self.dduf_entries[self.vocab_file].as_mmap() as mm:
+                with open(out_vocab_file, "wb") as out_file:
+                    out_file.write(mm)
+            logger.info(f"Copy vocab file to {out_vocab_file}")

         return (out_vocab_file,)