From 5787e4c159e92e75d5e16396a0ef64d71e25d186 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Mon, 6 Jul 2020 18:27:53 -0400
Subject: [PATCH] Various tokenizers fixes (#5558)

* BertTokenizerFast - Do not specify strip_accents by default

* Bump tokenizers to new version

* Add test for AddedToken serialization
---
 setup.py                              |  2 +-
 src/transformers/tokenization_bert.py |  2 +-
 tests/test_tokenization_common.py     |  7 ++++
 tests/test_tokenization_fast.py       | 56 ++++++++++++++++-----------
 4 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/setup.py b/setup.py
index eb8e6b211..feb9e2840 100644
--- a/setup.py
+++ b/setup.py
@@ -114,7 +114,7 @@ setup(
     packages=find_packages("src"),
     install_requires=[
         "numpy",
-        "tokenizers == 0.8.0-rc4",
+        "tokenizers == 0.8.1.rc1",
         # dataclasses for Python versions that don't have it
         "dataclasses;python_version<'3.7'",
         # utilities from PyPA to e.g. compare versions
diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py
index b168fe96f..5b5bd311e 100644
--- a/src/transformers/tokenization_bert.py
+++ b/src/transformers/tokenization_bert.py
@@ -606,7 +606,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         mask_token="[MASK]",
         clean_text=True,
         tokenize_chinese_chars=True,
-        strip_accents=True,
+        strip_accents=None,
         wordpieces_prefix="##",
         **kwargs
     ):
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index d23de55ef..b9c866cdc 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union

 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
 from transformers.testing_utils import require_tf, require_torch, slow
+from transformers.tokenization_utils import AddedToken


 if TYPE_CHECKING:
@@ -233,6 +234,12 @@ class TokenizerTesterMixin:

         self.assertListEqual(subwords, subwords_loaded)

+    def test_pickle_added_tokens(self):
+        tok1 = AddedToken("", rstrip=True, lstrip=True, normalized=False, single_word=True)
+        tok2 = pickle.loads(pickle.dumps(tok1))
+
+        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
+
     def test_added_tokens_do_lower_case(self):
         # TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens
         tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py
index 0682df672..9cd45c7e3 100644
--- a/tests/test_tokenization_fast.py
+++ b/tests/test_tokenization_fast.py
@@ -91,8 +91,6 @@ class CommonFastTokenizerTest(unittest.TestCase):
         self.assert_padding(tokenizer_r, tokenizer_p)
         self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
         self.assert_prepare_for_model(tokenizer_r, tokenizer_p)
-        # TODO: enable for v3.0.0
-        # self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p)

     def fast_only(self, tokenizer_r):
         # Ensure None raise an error
@@ -748,29 +746,41 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
             add_special_tokens=True,
         )

-        expected_results = [
-            ((0, 1), "A"),
-            ((1, 2), ","),
-            ((3, 8), "naive"),  # BERT normalizes this away
-            # Append MASK here after lower-casing
-            ((16, 21), "Allen"),
-            ((22, 24), "##NL"),
-            ((24, 25), "##P"),
-            ((26, 34), "sentence"),
-            ((35, 36), "."),
-        ]
-
-        # Check if the tokenizer is uncased
-        if tokenizer_r.init_kwargs.get("do_lower_case"):
-            expected_results = [(offset, token.lower()) for (offset, token) in expected_results]
-
-            # Append the special tokens
-            expected_results.insert(3, ((9, 15), "[MASK]"))
-            expected_results.insert(0, (None, "[CLS]"))
-            expected_results.append((None, "[SEP]"))
+        do_lower_case = tokenizer_r.init_kwargs.get("do_lower_case")
+        expected_results = (
+            [
+                ((0, 0), "[CLS]"),
+                ((0, 1), "A"),
+                ((1, 2), ","),
+                ((3, 5), "na"),
+                ((5, 6), "##ï"),
+                ((6, 8), "##ve"),
+                ((9, 15), "[MASK]"),
+                ((16, 21), "Allen"),
+                ((21, 23), "##NL"),
+                ((23, 24), "##P"),
+                ((25, 33), "sentence"),
+                ((33, 34), "."),
+                ((0, 0), "[SEP]"),
+            ]
+            if not do_lower_case
+            else [
+                ((0, 0), "[CLS]"),
+                ((0, 1), "a"),
+                ((1, 2), ","),
+                ((3, 8), "naive"),
+                ((9, 15), "[MASK]"),
+                ((16, 21), "allen"),
+                ((21, 23), "##nl"),
+                ((23, 24), "##p"),
+                ((25, 33), "sentence"),
+                ((33, 34), "."),
+                ((0, 0), "[SEP]"),
+            ]
+        )

         self.assertEqual([e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]))
-        # self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
+        self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])


 class RobertaFastTokenizerTest(CommonFastTokenizerTest):