Temporary solution to handle saving files from the DDUF format

Marc Sun 2024-12-09 15:20:48 +00:00
parent 734a186fd2
commit 48833071c0


@@ -95,6 +95,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         add_prefix_space=None,
         **kwargs,
     ):
+        self.dduf_entries = kwargs.get("dduf_entries", None)
         # Add extra_ids to the special token list
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
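For context, dduf_entries is expected to be the mapping returned by huggingface_hub's DDUF reader, keyed by archive-internal path. A minimal sketch of producing such a mapping for this kwarg, with the archive name as a hypothetical placeholder:

from huggingface_hub import read_dduf_file

# Maps archive-internal paths to DDUFEntry objects,
# e.g. {"tokenizer/spiece.model": DDUFEntry(...), ...}
dduf_entries = read_dduf_file("flux-dev.dduf")  # hypothetical archive name
print(list(dduf_entries))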
@@ -132,7 +133,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
     @property
     def can_save_slow_tokenizer(self) -> bool:
-        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        # TODO: update this. Putting it to True for now
+        # return os.path.isfile(self.vocab_file) if self.vocab_file else False
+        return True
 
     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
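Hard-coding can_save_slow_tokenizer to True means save_vocabulary is reached even when vocab_file does not exist on disk. One possible refinement for the TODO (an assumption, not what this commit does) would be to also accept a vocab file that lives inside the DDUF archive:

@property
def can_save_slow_tokenizer(self) -> bool:
    # Sketch of a stricter check: accept a real file on disk, or an
    # entry inside the loaded DDUF archive, instead of returning True
    # unconditionally. self.dduf_entries is the dict stashed in __init__.
    if not self.vocab_file:
        return False
    if os.path.isfile(self.vocab_file):
        return True
    return bool(self.dduf_entries) and self.vocab_file in self.dduf_entries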
@@ -173,10 +176,12 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
             logger.info(f"Copy vocab file to {out_vocab_file}")
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        # copyfile doesn't work with binary content, e.g. when we load the file from an archive
+        elif not os.path.isfile(self.vocab_file):
+            with self.dduf_entries[self.vocab_file].as_mmap() as mm:
+                with open(out_vocab_file, "wb") as out_file:
+                    out_file.write(mm)
+            logger.info(f"Copy vocab file to {out_vocab_file}")
 
         return (out_vocab_file,)
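copyfile cannot be used in the new branch because vocab_file names an entry inside the DDUF archive rather than a path on disk. DDUFEntry.as_mmap() is a context manager that memory-maps the entry's bytes in place, so they can be written out with a plain binary open(). A self-contained sketch of the same pattern, with the archive and entry names as hypothetical placeholders:

from huggingface_hub import read_dduf_file

dduf_entries = read_dduf_file("flux-dev.dduf")  # hypothetical archive name
entry_name = "tokenizer/spiece.model"           # hypothetical entry path

# as_mmap() yields the entry's raw bytes without extracting the whole
# archive; writing them with open(..., "wb") produces a standalone copy.
with dduf_entries[entry_name].as_mmap() as mm:
    with open("spiece.model", "wb") as out_file:
        out_file.write(mm)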