From 6d3d5b1039559a7cc03965cc8c2a6ff0291d83fc Mon Sep 17 00:00:00 2001 From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com> Date: Thu, 23 May 2024 17:53:26 +0100 Subject: [PATCH] Remove deprecated properties in tokenization_nllb.py and tokenization_nllb_fast.py (#29834) * Fix typo in tokenization_nllb.py Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability. * Fix typo in tokenization_nllb_fast.py Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability. * Remove deprecated attributes in tokenization_nllb.py Remove deprecated attributes: `lang_code_to_id`, `fairseq_tokens_to_ids`, `id_to_lang_code`, and `fairseq_ids_to_tokens` * Remove deprecated attribute in tokenization_nllb_fast.py Remove deprecated attribute `lang_code_to_id` * Remove deprecated properties in tokenization_nllb.py Remove deprecated properties - fix format * Remove deprecated properties in tokenization_nllb_fast.py Remove deprecated properties - fix format * Update test_tokenization_nllb.py * update test_tokenization_nllb.py * Update tokenization_nllb.py * Update test_tokenization_seamless_m4t.py * Update test_tokenization_seamless_m4t.py --- .../models/nllb/tokenization_nllb.py | 44 ------------------- .../models/nllb/tokenization_nllb_fast.py | 12 ----- tests/models/nllb/test_tokenization_nllb.py | 12 ----- .../test_tokenization_seamless_m4t.py | 9 ---- 4 files changed, 77 deletions(-) diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index f51712115..b5ae28b81 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -159,18 +159,6 @@ class NllbTokenizer(PreTrainedTokenizer): self.fairseq_offset = 1 self.sp_model_size = len(self.sp_model) - # Everything that follows is kept for BC and will be removed in v4.38 - self._fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} - language_codes = FAIRSEQ_LANGUAGE_CODES if additional_special_tokens is None else additional_special_tokens - self._lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(language_codes) - } - self._id_to_lang_code = {v: k for k, v in self._lang_code_to_id.items()} - self._fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - - self._fairseq_tokens_to_ids.update(self.lang_code_to_id) - self._fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -217,38 +205,6 @@ class NllbTokenizer(PreTrainedTokenizer): def src_lang(self) -> str: return self._src_lang - @property - def lang_code_to_id(self): - logger.warning_once( - "the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`" - " this attribute will be removed in `transformers` v4.38" - ) - return self._lang_code_to_id - - @property - def fairseq_tokens_to_ids(self): - logger.warning_once( - "the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`" - " this attribute will be removed in `transformers` v4.38" - ) - return self._fairseq_tokens_to_ids - - @property - def id_to_lang_code(self): - logger.warning_once( - "the `id_to_lang_code` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`" - " this attribute will be removed in `transformers` v4.38" - ) - return self._id_to_lang_code - - @property - def fairseq_ids_to_tokens(self): - logger.warning_once( - "the `_fairseq_ids_to_tokens` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`" - " this attribute will be removed in `transformers` v4.38" - ) - return self._fairseq_ids_to_tokens - @src_lang.setter def src_lang(self, new_src_lang: str) -> None: self._src_lang = new_src_lang diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 2004580bf..013dbc97b 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -161,23 +161,11 @@ class NllbTokenizerFast(PreTrainedTokenizerFast): **kwargs, ) - self._lang_code_to_id = { - lang_code: self.convert_tokens_to_ids(str(lang_code)) for lang_code in additional_special_tokens - } - self._src_lang = src_lang if src_lang is not None else "eng_Latn" self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang) self.tgt_lang = tgt_lang self.set_src_lang_special_tokens(self._src_lang) - @property - def lang_code_to_id(self): - logger.warning_once( - "the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`" - " this attribute will be removed in `transformers` v4.38" - ) - return self._lang_code_to_id - @property def can_save_slow_tokenizer(self) -> bool: return os.path.isfile(self.vocab_file) if self.vocab_file else False diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index 92134c3f8..b0ef147db 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -367,11 +367,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase): cls.pad_token_id = 1 return cls - def test_language_codes(self): - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Arab"], 256001) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Latn"], 256002) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["fra_Latn"], 256057) - def test_enro_tokenizer_batch_encode_plus(self): ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0] self.assertListEqual(self.expected_src_tokens, ids) @@ -397,13 +392,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase): def test_mask_token(self): self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["", "ar_AR"]), [256203, 3]) - def test_special_tokens_unaffacted_by_save_load(self): - tmpdirname = tempfile.mkdtemp() - original_special_tokens = self.tokenizer.fairseq_tokens_to_ids - self.tokenizer.save_pretrained(tmpdirname) - new_tok = NllbTokenizer.from_pretrained(tmpdirname) - self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) - @require_torch def test_enro_tokenizer_prepare_batch(self): batch = self.tokenizer( diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py index 2e65d01ea..7ccc04bc5 100644 --- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tempfile import unittest from transformers import ( @@ -499,14 +498,6 @@ class SeamlessM4TDistilledIntegrationTest(unittest.TestCase): self.assertEqual(ids[0], EN_CODE) self.assertEqual(len(ids), desired_max_length) - # Copied from tests.models.nllb.test_tokenization_nllb.NllbDistilledIntegrationTest.test_special_tokens_unaffacted_by_save_load with fairseq_tokens_to_ids->additional_special_tokens, Nllb->SeamlessM4T, Dict->List - def test_special_tokens_unaffacted_by_save_load(self): - tmpdirname = tempfile.mkdtemp() - original_special_tokens = self.tokenizer.additional_special_tokens - self.tokenizer.save_pretrained(tmpdirname) - new_tok = SeamlessM4TTokenizer.from_pretrained(tmpdirname) - self.assertListEqual(new_tok.additional_special_tokens, original_special_tokens) - @require_torch def test_enro_tokenizer_prepare_batch(self): batch = self.tokenizer(