add special tokens

This commit is contained in:
Arthur Zucker 2025-02-06 15:14:03 +01:00
parent ed98ad35e6
commit f0adc01768

View file

@ -1377,6 +1377,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
`<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>']`. Otherwise, if
`split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will give `['<','s', '>']`.
add_special_tokens (`bool`, *optional*): Whether or not to add special tokens when encoding.
"""
@ -1432,6 +1433,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
# By default, cleaning tokenization spaces for both fast and slow tokenizers
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
self._add_special_tokens = kwargs.pop("add_special_tokens", None)
# By default, do not split special tokens for both fast and slow tokenizers
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
@ -3474,7 +3476,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
and `convert_tokens_to_ids` methods.
"""
add_special_tokens = self._add_special_tokens if self.add_special_tokens is not None else add_special_tokens
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,