add special tokens

This commit is contained in:
Arthur Zucker 2025-02-06 15:14:03 +01:00
parent ed98ad35e6
commit f0adc01768

View file

@ -1377,6 +1377,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
`<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>']`. Otherwise, if
`split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will give `['<','s', '>']`.
add_special_tokens (`bool`, *optional*): Whether or not to add special tokens when encoding.
"""
@ -1432,6 +1433,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
# By default, cleaning tokenization spaces for both fast and slow tokenizers
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
self._add_special_tokens = kwargs.pop("add_special_tokens", None)
# By default, do not split special tokens for both fast and slow tokenizers
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
@ -3474,7 +3476,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
and `convert_tokens_to_ids` methods.
"""
add_special_tokens = self._add_special_tokens if self.add_special_tokens is not None else add_special_tokens
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,