Temporary solution to handle saving files from the DDUF format

Marc Sun 2024-12-09 15:20:48 +00:00
parent 734a186fd2
commit 48833071c0


@@ -95,6 +95,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space=None,
         **kwargs,
     ):
+        self.dduf_entries = kwargs.get("dduf_entries", None)
         # Add extra_ids to the special token list
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
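For context, dduf_entries is expected to be the mapping returned by huggingface_hub's DDUF reader, keyed by archive-internal path. A minimal sketch of producing such a mapping for this kwarg, with the archive name as a hypothetical placeholder:

from huggingface_hub import read_dduf_file

# Maps archive-internal paths to DDUFEntry objects,
# e.g. {"tokenizer/spiece.model": DDUFEntry(...), ...}
dduf_entries = read_dduf_file("flux-dev.dduf")  # hypothetical archive name
print(list(dduf_entries))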
@@ -132,7 +133,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
     @property
     def can_save_slow_tokenizer(self) -> bool:
-        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        # TODO: update this. Putting it to True for now
+        # return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        return True
 
     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
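Hard-coding can_save_slow_tokenizer to True means save_vocabulary is reached even when vocab_file does not exist on disk. One possible refinement for the TODO (an assumption, not what this commit does) would be to also accept a vocab file that lives inside the DDUF archive:

@property
def can_save_slow_tokenizer(self) -> bool:
    # Sketch of a stricter check: accept a real file on disk, or an
    # entry inside the loaded DDUF archive, instead of returning True
    # unconditionally. self.dduf_entries is the dict stashed in __init__.
    if not self.vocab_file:
        return False
    if os.path.isfile(self.vocab_file):
        return True
    return bool(self.dduf_entries) and self.vocab_file in self.dduf_entries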
@@ -173,10 +176,12 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
             logger.info(f"Copy vocab file to {out_vocab_file}")
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # copyfile doesn't work with binary content, e.g. when we load the file from an archive
+        elif not os.path.isfile(self.vocab_file):
+            with self.dduf_entries[self.vocab_file].as_mmap() as mm:
+                with open(out_vocab_file, "wb") as out_file:
+                    out_file.write(mm)
+            logger.info(f"Copy vocab file to {out_vocab_file}")
 
         return (out_vocab_file,)
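copyfile cannot be used in the new branch because vocab_file names an entry inside the DDUF archive rather than a path on disk. DDUFEntry.as_mmap() is a context manager that memory-maps the entry's bytes in place, so they can be written out with a plain binary open(). A self-contained sketch of the same pattern, with the archive and entry names as hypothetical placeholders:

from huggingface_hub import read_dduf_file

dduf_entries = read_dduf_file("flux-dev.dduf")  # hypothetical archive name
entry_name = "tokenizer/spiece.model"           # hypothetical entry path

# as_mmap() yields the entry's raw bytes without extracting the whole
# archive; writing them with open(..., "wb") produces a standalone copy.
with dduf_entries[entry_name].as_mmap() as mm:
    with open("spiece.model", "wb") as out_file:
        out_file.write(mm)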