Mirror of https://github.com/saymrwulf/transformers.git, synced 2026-05-15 21:01:19 +00:00
Improve tokenizer tests (#13594)
* Use new method to acquire tokenizers
* Resolve TODOs.
* Style
* Fix
* Enable do_lower_case in test_tokenize_special_tokens
* Apply suggestion from code review
* Fix mask token handling
* Revert "Fix mask token handling"

  This reverts commit daaa3f5291b1f71e5bc3604ca281c000000c4648.

* Fix FNet mask token tokenization
* Complete everything
* Apply suggestions from code review
This commit is contained in:
parent
6645eb61fa
commit
66ea739168
1 changed file with 38 additions and 32 deletions
tests/test_tokenization_common.py

@@ -314,7 +314,7 @@ class TokenizerTesterMixin:
     # TODO: this test can be combined with `test_sentencepiece_tokenize_and_convert_tokens_to_string` after the latter is extended to all tokenizers.
     def test_tokenize_special_tokens(self):
         """Test `tokenize` with special tokens."""
-        tokenizers = self.get_tokenizers(fast=True)
+        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
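For context (not part of the diff): passing do_lower_case=True makes this test also exercise lowercasing tokenizers, which must still leave special tokens intact. A minimal sketch of the behavior the test asserts, assuming a standard uncased BERT checkpoint is available:

    # Illustration only; "bert-base-uncased" is an assumed example checkpoint.
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    tokenizer.add_tokens(["[SPECIAL_TOKEN_1]"], special_tokens=True)
    print(tokenizer.tokenize("Hello [SPECIAL_TOKEN_1] World"))
    # Ordinary words are lowercased; the special token should come through unchanged:
    # ['hello', '[SPECIAL_TOKEN_1]', 'world']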
@@ -620,8 +620,7 @@ class TokenizerTesterMixin:
         self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

     def test_added_tokens_do_lower_case(self):
-        # TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens.
-        tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else []
+        tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
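The get_tokenizers helper this hunk switches to is the "new method to acquire tokenizers" from the commit message. Roughly (a simplified sketch of the mixin helper, not part of this diff), it forwards its kwargs to both the slow and the fast tokenizer:

    # Simplified sketch of TokenizerTesterMixin.get_tokenizers (error handling omitted).
    def get_tokenizers(self, fast=True, **kwargs):
        if fast and self.test_rust_tokenizer and self.test_slow_tokenizer:
            return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
        elif fast and self.test_rust_tokenizer:
            return [self.get_rust_tokenizer(**kwargs)]
        return [self.get_tokenizer(**kwargs)]

Unlike the deleted list comprehension, this also covers the fast (Rust) tokenizer, which is why the slow-only TODO above it could be dropped.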
@@ -632,30 +631,34 @@ class TokenizerTesterMixin:
                 text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                 text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

-                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
+                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

                 new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
-                added = tokenizer.add_tokens(new_toks)
-                self.assertEqual(added, 2)
+                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])

-                toks = tokenizer.tokenize(text)
-                toks2 = tokenizer.tokenize(text2)
+                toks_after_adding = tokenizer.tokenize(text)
+                toks_after_adding2 = tokenizer.tokenize(text2)

-                self.assertEqual(len(toks), len(toks2))
-                self.assertListEqual(toks, toks2)
-                if not isinstance(tokenizer, PreTrainedTokenizerFast):
-                    # Python tokenizers can have added tokens with spaces inside them
-                    # cf https://github.com/huggingface/tokenizers/issues/302
-                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
+                # Rust tokenizers don't lowercase added tokens at the time of calling `tokenizer.add_tokens`,
+                # while python tokenizers do, so new_toks 0 and 2 would be treated as the same, so do new_toks 1 and 3.
+                self.assertIn(added, [2, 4])
+
+                self.assertListEqual(toks_after_adding, toks_after_adding2)
+                self.assertTrue(
+                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
+                )

                 # Check that none of the special tokens are lowercased
                 sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
-                tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)
+                # Convert the tokenized list to str as some special tokens are tokenized like normal tokens
+                # which have a prefix space, e.g. the mask token of Albert, and cannot match the original
+                # special tokens exactly.
+                tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

                 for special_token in tokenizer.all_special_tokens:
                     self.assertTrue(special_token in tokenized_sequence)

-        tokenizers = [self.get_tokenizer(do_lower_case=True)] if self.test_slow_tokenizer else []
+        tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
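Why the new assertion accepts either count (an illustration, not part of the diff): a lowercasing slow tokenizer normalizes tokens as they are added, so the cased/uncased pairs collapse to two entries, while a Rust tokenizer stores all four verbatim:

    new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
    print(len({tok.lower() for tok in new_toks}))  # 2 -> what a lowercasing (slow) tokenizer ends up adding
    print(len(set(new_toks)))                      # 4 -> what a Rust (fast) tokenizer adds verbatim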
@@ -666,22 +669,22 @@ class TokenizerTesterMixin:
                 text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                 text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

+                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks
+
                 new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]

-                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks
-
-                added = tokenizer.add_tokens(new_toks)
+                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
+                self.assertIn(added, [2, 4])

-                toks = tokenizer.tokenize(text)
-                toks2 = tokenizer.tokenize(text2)
+                toks_after_adding = tokenizer.tokenize(text)
+                toks_after_adding2 = tokenizer.tokenize(text2)

-                self.assertEqual(len(toks), len(toks2))  # Length should still be the same
-                self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ
-                if not isinstance(tokenizer, PreTrainedTokenizerFast):
-                    # Python tokenizers can have added tokens with spaces inside them
-                    # cf https://github.com/huggingface/tokenizers/issues/302
-                    self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
+                self.assertEqual(len(toks_after_adding), len(toks_after_adding2))  # Length should still be the same
+                self.assertNotEqual(
+                    toks_after_adding[1], toks_after_adding2[1]
+                )  # But at least the first non-special tokens should differ
+                self.assertTrue(
+                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
+                )

     def test_add_tokens_tokenizer(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
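The lstrip=True, rstrip=True flags on AddedToken are what let these multi-word tokens match regardless of the whitespace around them. A hedged sketch (AddedToken is re-exported by transformers; the checkpoint name is an assumption):

    from transformers import AddedToken, BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.add_tokens([AddedToken("aaaaa bbbbbb", lstrip=True, rstrip=True)])
    # The added token absorbs adjacent whitespace instead of being split on it,
    # so cased and uncased inputs can be compared position by position:
    print(tokenizer.tokenize("low aaaaa bbbbbb low"))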
@@ -780,12 +783,15 @@ class TokenizerTesterMixin:
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):

-                # new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
-                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
+                new_toks = [
+                    AddedToken("[ABC]", normalized=False),
+                    AddedToken("[DEF]", normalized=False),
+                    AddedToken("GHI IHG", normalized=False),
+                ]
                 tokenizer.add_tokens(new_toks)
-                input = "[ABC][DEF][ABC][DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                input = "[ABC][DEF][ABC]GHI IHG[DEF]"
                 if self.space_between_special_tokens:
-                    output = "[ABC] [DEF] [ABC] [DEF]"
+                    output = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
                 else:
                     output = input
                 encoded = tokenizer.encode(input, add_special_tokens=False)
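The space_between_special_tokens class flag this hunk branches on mirrors the spaces_between_special_tokens argument of the slow tokenizers' decode. A small sketch of the round-trip the hunk exercises (checkpoint name is an assumption):

    from transformers import AddedToken, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.add_tokens([AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)])
    encoded = tokenizer.encode("[ABC][DEF]", add_special_tokens=False)
    # Slow tokenizers join added/special tokens with spaces when decoding:
    print(tokenizer.decode(encoded, spaces_between_special_tokens=True))  # "[ABC] [DEF]"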