From 48833071c0f8a7810374e8fe89b58afa7cd8c970 Mon Sep 17 00:00:00 2001
From: Marc Sun
Date: Mon, 9 Dec 2024 15:20:48 +0000
Subject: [PATCH] temporary solution to handle saving a file from the dduf
 format

---
 .../models/t5/tokenization_t5_fast.py         | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 9c150d093..0734cfd64 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -95,6 +95,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space=None,
         **kwargs,
     ):
+        self.dduf_entries = kwargs.get("dduf_entries", None)
         # Add extra_ids to the special token list
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
@@ -128,7 +129,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):

     @property
     def can_save_slow_tokenizer(self) -> bool:
-        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        # TODO: update this. Setting it to True for now
+        # return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        return True

     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
@@ -173,10 +176,12 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
             logger.info(f"Copy vocab file to {out_vocab_file}")
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # copyfile doesn't work with binary content, e.g. when we load the file from an archive
+        elif not os.path.isfile(self.vocab_file):
+            with self.dduf_entries[self.vocab_file].as_mmap() as mm:
+                with open(out_vocab_file, "wb") as out_file:
+                    out_file.write(mm)
+            logger.info(f"Copy vocab file to {out_vocab_file}")

         return (out_vocab_file,)