From f0adc0176835f8756220314b4d6ee2fca6ffa773 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 6 Feb 2025 15:14:03 +0100
Subject: [PATCH] add special tokens

---
 src/transformers/tokenization_utils_base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 7ad36ab01..b19d93b11 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1377,6 +1377,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
             internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
             `<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>`]. Otherwise, if
             `split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will be give `['<','s', '>']`.
+        add_special_tokens (`bool`, *optional*): Whether or not to add special tokens when encoding.
 """
 
 
@@ -1432,6 +1433,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # By default, cleaning tokenization spaces for both fast and slow tokenizers
         self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
 
+        self._add_special_tokens = kwargs.pop("add_special_tokens", None)
         # By default, do not split special tokens for both fast and slow tokenizers
         self.split_special_tokens = kwargs.pop("split_special_tokens", False)
 
@@ -3474,7 +3476,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and
                 `convert_tokens_to_ids` methods.
         """
-
+        add_special_tokens = self._add_special_tokens if self._add_special_tokens is not None else add_special_tokens
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
         padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
             padding=padding,