mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
add special tokens
This commit is contained in:
parent
ed98ad35e6
commit
f0adc01768
1 changed files with 3 additions and 1 deletions
|
|
@@ -1377,6 +1377,7 @@ INIT_TOKENIZER_DOCSTRING = r"""
|
|||
internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
|
||||
`<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>']`. Otherwise, if
|
||||
`split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will give `['<', 's', '>']`.
|
||||
add_special_tokens (`bool`, *optional*): Whether or not to add special tokens when encoding.
|
||||
"""
|
||||
|
||||
|
||||
|
|
@@ -1432,6 +1433,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||
|
||||
# By default, cleaning tokenization spaces for both fast and slow tokenizers
|
||||
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
|
||||
self._add_special_tokens = kwargs.pop("add_special_tokens", None)
|
||||
|
||||
# By default, do not split special tokens for both fast and slow tokenizers
|
||||
self.split_special_tokens = kwargs.pop("split_special_tokens", False)
|
||||
|
|
@@ -3474,7 +3476,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
|
||||
and `convert_tokens_to_ids` methods.
|
||||
"""
|
||||
|
||||
add_special_tokens = self._add_special_tokens if self.add_special_tokens is not None else add_special_tokens
|
||||
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
|
||||
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
|
||||
padding=padding,
|
||||
|
|
|
|||
Loading…
Reference in a new issue