From f0adc0176835f8756220314b4d6ee2fca6ffa773 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Thu, 6 Feb 2025 15:14:03 +0100
Subject: [PATCH] add special tokens

---
 src/transformers/tokenization_utils_base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 7ad36ab01..b19d93b11 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1377,6 +1377,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
             internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
             `<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>`]. Otherwise, if
             `split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will be give `['<','s', '>']`.
+        add_special_tokens (`bool`, *optional*): Whether or not to add special tokens when encoding.
 """
 
 
@@ -1432,6 +1433,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # By default, cleaning tokenization spaces for both fast and slow tokenizers
         self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
 
+        self._add_special_tokens = kwargs.pop("add_special_tokens", None)
         # By default, do not split special tokens for both fast and slow tokenizers
         self.split_special_tokens = kwargs.pop("split_special_tokens", False)
 
@@ -3474,7 +3476,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and
                 `convert_tokens_to_ids` methods.
         """
-
+        add_special_tokens = self._add_special_tokens if self._add_special_tokens is not None else add_special_tokens
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
         padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
             padding=padding,