diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 9c150d093..0734cfd64 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -95,6 +95,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space=None,
         **kwargs,
     ):
+        self.dduf_entries = kwargs.get("dduf_entries", None)
         # Add extra_ids to the special token list
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
@@ -133,7 +134,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
 
     @property
     def can_save_slow_tokenizer(self) -> bool:
-        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        # TODO: update this. Putting it to True for now
+        # return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        return True
 
     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
@@ -173,10 +176,12 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
 
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
             logger.info(f"Copy vocab file to {out_vocab_file}")
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # copyfile doesn't work with binary content, e.g. when we load the file from an archive
+        elif not os.path.isfile(self.vocab_file):
+            with self.dduf_entries[self.vocab_file].as_mmap() as mm:
+                with open(out_vocab_file, "wb") as out_file:
+                    out_file.write(mm)
+            logger.info(f"Copy vocab file to {out_vocab_file}")
 
         return (out_vocab_file,)
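
For context, here is a minimal standalone sketch of the save path the third hunk implements: when the vocab file lives only inside a DDUF archive, `os.path.isfile()` is false and `copyfile` can't be used, so the entry is memory-mapped and its bytes are written out instead. It assumes `huggingface_hub`'s DDUF reader (`read_dduf_file`, which returns a dict of `DDUFEntry` objects exposing `as_mmap()`); the archive path and entry name below are illustrative, not taken from this PR.

```python
import os

from huggingface_hub import read_dduf_file

# Map of entry name -> DDUFEntry for every file stored in the archive.
dduf_entries = read_dduf_file("archive.dduf")  # hypothetical archive path

# The vocab file exists only inside the archive, so copyfile() would fail.
# Memory-map the entry and write its bytes out, mirroring save_vocabulary().
vocab_entry = "tokenizer/spiece.model"  # hypothetical entry name
with dduf_entries[vocab_entry].as_mmap() as mm:
    with open("spiece.model", "wb") as out_file:
        out_file.write(mm)

assert os.path.isfile("spiece.model")
```

Using `as_mmap()` rather than reading the entry into a bytes object keeps the copy zero-allocation for large SentencePiece models, which is presumably why the diff reaches for it here.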