diff --git a/.circleci/config.yml b/.circleci/config.yml index 12f13381f..7cd25f75b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -559,6 +559,10 @@ jobs: if [ -f test_list.txt ]; then python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt fi + - run: | + if [ -f test_list.txt ]; then + python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt + fi - store_artifacts: path: ~/transformers/tests_output.txt - store_artifacts: diff --git a/setup.py b/setup.py index 7395a9599..adc79f9dc 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,7 @@ _deps = [ "filelock", "flake8>=3.8.3", "flax>=0.3.5", + "ftfy", "fugashi>=1.0", "GitPython<3.1.19", "huggingface-hub>=0.1.0,<1.0", @@ -242,6 +243,7 @@ else: extras["flax"] = deps_list("jax", "jaxlib", "flax", "optax") extras["tokenizers"] = deps_list("tokenizers") +extras["ftfy"] = deps_list("ftfy") extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools") extras["onnx"] = deps_list("onnxconverter-common", "tf2onnx") + extras["onnxruntime"] extras["modelcreation"] = deps_list("cookiecutter") diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 822b09849..92f25bd0c 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -823,6 +823,7 @@ class CLIPConverter(Converter): def converted(self) -> Tokenizer: vocab = self.original_tokenizer.encoder merges = list(self.original_tokenizer.bpe_ranks.keys()) + unk_token = self.original_tokenizer.unk_token tokenizer = Tokenizer( BPE( @@ -832,13 +833,32 @@ continuing_subword_prefix="", end_of_word_suffix="</w>", fuse_unk=False, + unk_token=str(unk_token), ) ) - tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=self.original_tokenizer.add_prefix_space) + tokenizer.normalizer = normalizers.Sequence( + [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()] + ) + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + Regex(r"""'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""), + behavior="removed", + invert=True, + ), + pre_tokenizers.ByteLevel(add_prefix_space=False), + ] + ) tokenizer.decoder = decoders.ByteLevel() - tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) + # Hack to get both ByteLevel post-processing and a TemplateProcessing-style template + tokenizer.post_processor = processors.RobertaProcessing( + sep=(self.original_tokenizer.eos_token, self.original_tokenizer.eos_token_id), + cls=(self.original_tokenizer.bos_token, self.original_tokenizer.bos_token_id), + add_prefix_space=False, + trim_offsets=False, + ) return tokenizer diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 9c60024f0..8131c6f5e 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -15,6 +15,7 @@ deps = { "filelock": "filelock", "flake8": "flake8>=3.8.3", "flax": "flax>=0.3.5", + "ftfy": "ftfy", "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", "huggingface-hub": "huggingface-hub>=0.1.0,<1.0", diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 21194cb6f..8a19d298e 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -158,6 +158,13
@@ except importlib_metadata.PackageNotFoundError: except importlib_metadata.PackageNotFoundError: _faiss_available = False +_ftfy_available = importlib.util.find_spec("ftfy") is not None +try: + _ftfy_version = importlib_metadata.version("ftfy") + logger.debug(f"Successfully imported ftfy version {_ftfy_version}") +except importlib_metadata.PackageNotFoundError: + _ftfy_available = False + coloredlogs = importlib.util.find_spec("coloredlogs") is not None try: @@ -441,6 +448,10 @@ def is_flax_available(): return _flax_available +def is_ftfy_available(): + return _ftfy_available + + def is_torch_tpu_available(): if not _torch_available: return False @@ -516,10 +527,6 @@ def is_spacy_available(): return importlib.util.find_spec("spacy") is not None -def is_ftfy_available(): - return importlib.util.find_spec("ftfy") is not None - - def is_in_notebook(): try: # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py @@ -722,6 +729,13 @@ FLAX_IMPORT_ERROR = """ installation page: https://github.com/google/flax and follow the ones that match your environment. """ +# docstyle-ignore +FTFY_IMPORT_ERROR = """ +{0} requires the ftfy library but it was not found in your environment. Check out the instructions in the +installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones +that match your environment. +""" + # docstyle-ignore SCATTER_IMPORT_ERROR = """ @@ -801,6 +815,7 @@ BACKENDS_MAPPING = OrderedDict( ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)), ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)), ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)), ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)), ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)), diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index 01a2a7550..ec2fe6e70 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -48,7 +48,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { PRETRAINED_INIT_CONFIGURATION = { - "openai/clip-vit-base-patch32": {"do_lower_case": True}, + "openai/clip-vit-base-patch32": {}, } @@ -101,19 +101,6 @@ class CLIPTokenizer(PreTrainedTokenizer): """ Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding. - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). - - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -132,9 +119,6 @@ class CLIPTokenizer(PreTrainedTokenizer): The beginning of sequence token. eos_token (`str`, *optional*, defaults to `<|endoftext|>`): The end of sequence token.
- add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (CLIP tokenizer detect beginning of words by the preceding space). """ vocab_files_names = VOCAB_FILES_NAMES @@ -151,8 +135,6 @@ class CLIPTokenizer(PreTrainedTokenizer): bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - add_prefix_space=False, - do_lower_case=True, **kwargs ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token @@ -165,8 +147,6 @@ class CLIPTokenizer(PreTrainedTokenizer): bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, - add_prefix_space=add_prefix_space, - do_lower_case=do_lower_case, **kwargs, ) @@ -190,21 +170,12 @@ class CLIPTokenizer(PreTrainedTokenizer): bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} - self.add_prefix_space = add_prefix_space self.pat = re.compile( r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE, ) - # Very ugly hack to enable padding - @property - def pad_token_id(self) -> Optional[int]: - """ - `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set. - """ - return 0 - @property def vocab_size(self): return len(self.encoder) @@ -232,9 +203,12 @@ class CLIPTokenizer(PreTrainedTokenizer): Returns: `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + if token_ids_1 is None: - return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] - return [self.bos_token_id] + token_ids_0 + token_ids_1 + [self.eos_token_id] + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token def get_special_tokens_mask( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False @@ -262,7 +236,30 @@ class CLIPTokenizer(PreTrainedTokenizer): if token_ids_1 is None: return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return len(bos_token + token_ids_0 + eos_token) * [0] + return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] def bpe(self, token): if token in self.cache: @@ -332,7 +329,8 @@ class CLIPTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" text = "".join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors).replace("", " ") + byte_array = bytearray([self.byte_decoder[c] for c in text]) + text = byte_array.decode("utf-8", errors=self.errors).replace("", " ").strip() return text def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: @@ -363,9 +361,3 @@ class CLIPTokenizer(PreTrainedTokenizer): index += 1 return vocab_file, merge_file - - def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): - add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) - if is_split_into_words or add_prefix_space: - text = " " + text - return (text, kwargs) diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py index 42713e4b3..f6ff684c6 100644 --- a/src/transformers/models/clip/tokenization_clip_fast.py +++ b/src/transformers/models/clip/tokenization_clip_fast.py @@ -15,12 +15,10 @@ """Tokenization classes for OpenAI GPT.""" -import json -from typing import Optional, Tuple +from typing import List, Optional, Tuple from tokenizers import pre_tokenizers -from ...tokenization_utils_base import BatchEncoding from ...tokenization_utils_fast import PreTrainedTokenizerFast from ...utils import logging from .tokenization_clip import CLIPTokenizer @@ -52,27 +50,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast): Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level Byte-Pair-Encoding. - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - ``` - >>> from transformers import CLIPTokenizerFast - >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32") - >>> tokenizer("Hello world")['input_ids'] - [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] - [18435, 995] - ``` - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. - - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -81,9 +58,6 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast): Path to the vocabulary file. merges_file (`str`): Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. unk_token (`str`, *optional*, defaults to `<|endoftext|>`): The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. @@ -91,11 +65,6 @@ The beginning of sequence token. eos_token (`str`, *optional*, defaults to `<|endoftext|>`): The end of sequence token. - add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (CLIP tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether or not the post-processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES @@ -113,7 +82,6 @@ bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - add_prefix_space=False, **kwargs ): super().__init__( @@ -124,44 +92,81 @@ bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, - add_prefix_space=add_prefix_space, **kwargs, ) - pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) - if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: - pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) - pre_tok_state["add_prefix_space"] = add_prefix_space - self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence): + raise ValueError( + "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been " + "heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using to be compatible with this version. " + "The easiest way to do so is " + '`CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo", from_slow=True)`.' + " If you want to use your existing tokenizer, you will have to revert to a version prior to " + "4.17.0 of transformers." + ) - self.add_prefix_space = add_prefix_space + self._wrap_decode_method_backend_tokenizer() - # Very ugly hack to enable padding - @property - def pad_token_id(self) -> Optional[int]: + # Very ugly hack so that padding decodes correctly, see https://github.com/huggingface/tokenizers/issues/872 + def _wrap_decode_method_backend_tokenizer(self): + orig_decode_method = self.backend_tokenizer.decode + + def new_decode_method(*args, **kwargs): + text = orig_decode_method(*args, **kwargs) + text = text.replace(self.backend_tokenizer.model.end_of_word_suffix, " ").strip() + return text + + self.backend_tokenizer.decode = new_decode_method + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ - `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set. + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and + adding special tokens. A CLIP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs.
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ - return 0 + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] - def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token - return super()._batch_encode_plus(*args, **kwargs) + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of + zeros is returned. - def _encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) + Returns: + `List[int]`: List of zeros. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] - return super()._encode_plus(*args, **kwargs) + if token_ids_1 is None: + return len(bos_token + token_ids_0 + eos_token) * [0] + return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) diff --git a/tests/test_tokenization_clip.py b/tests/test_tokenization_clip.py index 2f5ab7bd4..7bc31ece1 100644 --- a/tests/test_tokenization_clip.py +++ b/tests/test_tokenization_clip.py @@ -20,7 +20,7 @@ import unittest from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES -from transformers.testing_utils import require_tokenizers +from transformers.testing_utils import require_ftfy, require_tokenizers from .test_tokenization_common import TokenizerTesterMixin @@ -30,18 +30,20 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CLIPTokenizer rust_tokenizer_class = CLIPTokenizerFast - test_rust_tokenizer = False - from_pretrained_kwargs = {"add_prefix_space": True} + test_rust_tokenizer = True + from_pretrained_kwargs = {} test_seq2seq = False def setUp(self): super().setUp() + # temporary addition: to test the new slow-to-fast converter + self.tokenizers_list = [(CLIPTokenizerFast, "SaulLu/clip-vit-base-patch32", {})] # fmt: off - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "low", "er", "lowest", "newer", "wider", "<unk>", "<|endoftext|>"] + vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: on vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", "l o", "lo w", "e r", ""] + merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"] self.special_tokens_map = {"unk_token": "<unk>"} self.vocab_file =
os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) @@ -61,148 +63,126 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def get_input_output_texts(self, tokenizer): input_text = "lower newer" - output_text = "lower newer " + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "lower newer" - bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"] + bpe_tokens = ["lo", "w", "er</w>", "n", "e", "w", "er</w>"] - tokens = tokenizer.tokenize(text, add_prefix_space=True) + tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] - input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16] + input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - def test_rust_and_python_full_tokenizers(self): - if not self.test_rust_tokenizer: - return - - tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) - - sequence = "lower newer" - - # Testing tokenization - tokens = tokenizer.tokenize(sequence, add_prefix_space=True) - rust_tokens = rust_tokenizer.tokenize(sequence) - self.assertListEqual(tokens, rust_tokens) - - # Testing conversion to ids without special tokens - ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) - rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) - self.assertListEqual(ids, rust_ids) - - # Testing conversion to ids with special tokens - rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) - ids = tokenizer.encode(sequence, add_prefix_space=True) - rust_ids = rust_tokenizer.encode(sequence) - self.assertListEqual(ids, rust_ids) - - # Testing the unknown token - input_tokens = tokens + [rust_tokenizer.unk_token] - input_bpe_tokens = [10, 2, 12, 9, 3, 2, 12, 16] - self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - - def test_pretokenized_inputs(self, *args, **kwargs): - # It's very difficult to mix/test pretokenization with byte-level - # And get both CLIP and Roberta to work at the same time (mostly an issue of adding a space before the string) - pass - - def test_padding(self, max_length=15): + @require_ftfy + def test_check_encoding_slow_fast(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - # Simple input - s = "This is a simple input" - s2 = ["This is a simple input 1", "This is a simple input 2"] - p = ("This is a simple input", "This is a pair") - p2 = [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), + text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat" + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on an example containing a character (Latin Small Letter A + # with Tilde) encoded in 2 different ways + text = "xa\u0303y" + " " + "x\xe3y" + text_tokenized_s = tokenizer_s.tokenize(text) + text_tokenized_r = tokenizer_r.tokenize(text) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is
identical on unicode of space type + spaces_unicodes = [ + "\u0009", # (horizontal tab, '\t') + "\u000B", # (vertical tab) + "\u000C", # (form feed) + "\u0020", # (space, ' ') + "\u200E", # (left-to-right mark) + "\u200F", # (right-to-left mark) + ] + for unicode_seq in spaces_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = tokenizer_r.tokenize(unicode_seq) + + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + # Test that the tokenization is identical on unicode of line break type + line_break_unicodes = [ + "\u000A", # (line feed, '\n') + "\r\n", # (carriage return and line feed, '\r\n') + "\u000D", # (carriage return, '\r') + "\r", # (carriage return, '\r') + "\u000D", # (carriage return, '\r') + "\u2028", # (line separator) + "\u2029", # (paragraph separator) + # "\u0085", # (next line) ] - # Simple input tests - self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + # The tokenization is not identical for the character "\u0085" (next line). The slow version transforms + # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a + # space (and thus into an empty list). - # Simple input - self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + for unicode_seq in line_break_unicodes: + text_tokenized_s = tokenizer_s.tokenize(unicode_seq) + text_tokenized_r = tokenizer_r.tokenize(unicode_seq) - # Simple input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - s2, - max_length=max_length, - padding="max_length", + self.assertListEqual(text_tokenized_s, text_tokenized_r) + + def test_offsets_mapping_with_different_add_prefix_space_argument(self): + # Test which aims to verify that the offsets are well adapted to the presence or absence of a leading space in the text + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` + text = f"{text_of_1_token} {text_of_1_token}" + + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), ) - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + text = f" {text}" - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") - - # Pair input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - p2, - max_length=max_length, - padding="max_length", + tokenizer_r = self.rust_tokenizer_class.from_pretrained( + pretrained_name, + use_fast=True, + ) + encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) + self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token))) + self.assertEqual( + encoding.offset_mapping[1], + (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), ) - def test_add_tokens_tokenizer(self): - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - vocab_size =
tokenizer.vocab_size - all_size = len(tokenizer) + def test_log_warning(self): + # Test related to the breaking change introduced in transformers v4.17.0 + # We need to check that an error is raised when the user tries to load a previous version of the tokenizer. + with self.assertRaises(ValueError) as context: + self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer") - self.assertNotEqual(vocab_size, 0) + self.assertTrue( + context.exception.args[0].startswith( + "The `backend_tokenizer` provided does not match the expected format." + ) + ) - # We usually have added tokens from the start in tests because our vocab fixtures are - # smaller than the original vocabs - let's not assert this - # self.assertEqual(vocab_size, all_size) + @require_ftfy + def test_tokenization_python_rust_equals(self): + super().test_tokenization_python_rust_equals() - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"] - added_toks = tokenizer.add_tokens(new_toks) - vocab_size_2 = tokenizer.vocab_size - all_size_2 = len(tokenizer) - - self.assertNotEqual(vocab_size_2, 0) - self.assertEqual(vocab_size, vocab_size_2) - self.assertEqual(added_toks, len(new_toks)) - self.assertEqual(all_size_2, all_size + len(new_toks)) - - tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False) - - self.assertGreaterEqual(len(tokens), 4) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - - new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} - added_toks_2 = tokenizer.add_special_tokens(new_toks_2) - vocab_size_3 = tokenizer.vocab_size - all_size_3 = len(tokenizer) - - self.assertNotEqual(vocab_size_3, 0) - self.assertEqual(vocab_size, vocab_size_3) - self.assertEqual(added_toks_2, len(new_toks_2)) - self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - - tokens = tokenizer.encode( - ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False - ) - - self.assertGreaterEqual(len(tokens), 6) - self.assertGreater(tokens[0], tokenizer.vocab_size - 1) - self.assertGreater(tokens[0], tokens[1]) - self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - self.assertGreater(tokens[-2], tokens[-3]) - self.assertEqual(tokens[0], tokenizer.eos_token_id) - # padding is very hacky in CLIPTokenizer, pad_token_id is always 0 - # so skip this check - # self.assertEqual(tokens[-2], tokenizer.pad_token_id) + # overwrite common test + def test_added_tokens_do_lower_case(self): + # CLIP always lowercases letters + pass
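A minimal usage sketch of the special-token layout this patch introduces, assuming only the converted `openai/clip-vit-base-patch32` checkpoint is available; no token ids are hard-coded, so the assertions follow directly from the `build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` implementations above:

```
from transformers import CLIPTokenizer

# Assumes the converted openai/clip-vit-base-patch32 checkpoint (the one this diff targets).
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

ids_a = tokenizer.encode("a photo of a cat", add_special_tokens=False)
ids_b = tokenizer.encode("a photo of a dog", add_special_tokens=False)

# Single sequence: <|startoftext|> A <|endoftext|>
single = tokenizer.build_inputs_with_special_tokens(ids_a)
assert single == [tokenizer.bos_token_id] + ids_a + [tokenizer.eos_token_id]

# Pair: <|startoftext|> A <|endoftext|> <|endoftext|> B <|endoftext|>
# Two eos tokens act as the separator, since CLIP has no dedicated sep token.
pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
assert pair == (
    [tokenizer.bos_token_id] + ids_a + [tokenizer.eos_token_id]
    + [tokenizer.eos_token_id] + ids_b + [tokenizer.eos_token_id]
)

# Token type ids are all zeros for CLIP, for single sequences and pairs alike.
assert tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b) == [0] * len(pair)
```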