From 6d3d5b1039559a7cc03965cc8c2a6ff0291d83fc Mon Sep 17 00:00:00 2001
From: Yasmin Moslem <48152713+ymoslem@users.noreply.github.com>
Date: Thu, 23 May 2024 17:53:26 +0100
Subject: [PATCH] Remove deprecated properties in tokenization_nllb.py and
 tokenization_nllb_fast.py (#29834)

* Fix typo in tokenization_nllb.py

Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.

* Fix typo in tokenization_nllb_fast.py

Change `adder_tokens_decoder` into `added_tokens_decoder` and improve the warning's readability.

* Remove deprecated attributes in tokenization_nllb.py

Remove deprecated attributes: `lang_code_to_id`, `fairseq_tokens_to_ids`, `id_to_lang_code`, and `fairseq_ids_to_tokens`

* Remove deprecated attribute in tokenization_nllb_fast.py

Remove deprecated attribute `lang_code_to_id`

* Remove deprecated properties in tokenization_nllb.py

Remove deprecated properties - fix format

* Remove deprecated properties in tokenization_nllb_fast.py

Remove deprecated properties - fix format

* Update test_tokenization_nllb.py

* Update test_tokenization_nllb.py

* Update tokenization_nllb.py

* Update test_tokenization_seamless_m4t.py

* Update test_tokenization_seamless_m4t.py
---
 .../models/nllb/tokenization_nllb.py          | 44 -------------------
 .../models/nllb/tokenization_nllb_fast.py     | 12 -----
 tests/models/nllb/test_tokenization_nllb.py   | 12 -----
 .../test_tokenization_seamless_m4t.py         |  9 ----
 4 files changed, 77 deletions(-)
diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py
index f51712115..b5ae28b81 100644
--- a/src/transformers/models/nllb/tokenization_nllb.py
+++ b/src/transformers/models/nllb/tokenization_nllb.py
@@ -159,18 +159,6 @@ class NllbTokenizer(PreTrainedTokenizer):
         self.fairseq_offset = 1
         self.sp_model_size = len(self.sp_model)
 
-        # Everything that follows is kept for BC and will be removed in v4.38
-        self._fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
-        language_codes = FAIRSEQ_LANGUAGE_CODES if additional_special_tokens is None else additional_special_tokens
-        self._lang_code_to_id = {
-            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(language_codes)
-        }
-        self._id_to_lang_code = {v: k for k, v in self._lang_code_to_id.items()}
-        self._fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
-
-        self._fairseq_tokens_to_ids.update(self.lang_code_to_id)
-        self._fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -217,38 +205,6 @@ class NllbTokenizer(PreTrainedTokenizer):
     def src_lang(self) -> str:
         return self._src_lang
 
-    @property
-    def lang_code_to_id(self):
-        logger.warning_once(
-            "the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
-            " this attribute will be removed in `transformers` v4.38"
-        )
-        return self._lang_code_to_id
-
-    @property
-    def fairseq_tokens_to_ids(self):
-        logger.warning_once(
-            "the `fairseq_tokens_to_ids` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
-            " this attribute will be removed in `transformers` v4.38"
-        )
-        return self._fairseq_tokens_to_ids
-
-    @property
-    def id_to_lang_code(self):
-        logger.warning_once(
-            "the `id_to_lang_code` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
-            " this attribute will be removed in `transformers` v4.38"
-        )
-        return self._id_to_lang_code
-
-    @property
-    def fairseq_ids_to_tokens(self):
-        logger.warning_once(
-            "the `_fairseq_ids_to_tokens` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
-            " this attribute will be removed in `transformers` v4.38"
-        )
-        return self._fairseq_ids_to_tokens
-
     @src_lang.setter
     def src_lang(self, new_src_lang: str) -> None:
         self._src_lang = new_src_lang
diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py
index 2004580bf..013dbc97b 100644
--- a/src/transformers/models/nllb/tokenization_nllb_fast.py
+++ b/src/transformers/models/nllb/tokenization_nllb_fast.py
@@ -161,23 +161,11 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
             **kwargs,
         )
 
-        self._lang_code_to_id = {
-            lang_code: self.convert_tokens_to_ids(str(lang_code)) for lang_code in additional_special_tokens
-        }
-
         self._src_lang = src_lang if src_lang is not None else "eng_Latn"
         self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
         self.tgt_lang = tgt_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    @property
-    def lang_code_to_id(self):
-        logger.warning_once(
-            "the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder`"
-            " this attribute will be removed in `transformers` v4.38"
-        )
-        return self._lang_code_to_id
-
     @property
     def can_save_slow_tokenizer(self) -> bool:
         return os.path.isfile(self.vocab_file) if self.vocab_file else False
diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py
index 92134c3f8..b0ef147db 100644
--- a/tests/models/nllb/test_tokenization_nllb.py
+++ b/tests/models/nllb/test_tokenization_nllb.py
@@ -367,11 +367,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
         cls.pad_token_id = 1
         return cls
 
-    def test_language_codes(self):
-        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Arab"], 256001)
-        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ace_Latn"], 256002)
-        self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["fra_Latn"], 256057)
-
     def test_enro_tokenizer_batch_encode_plus(self):
         ids = self.tokenizer.batch_encode_plus(self.src_text).input_ids[0]
         self.assertListEqual(self.expected_src_tokens, ids)
@@ -397,13 +392,6 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
     def test_mask_token(self):
         self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["<mask>", "ar_AR"]), [256203, 3])
 
-    def test_special_tokens_unaffacted_by_save_load(self):
-        tmpdirname = tempfile.mkdtemp()
-        original_special_tokens = self.tokenizer.fairseq_tokens_to_ids
-        self.tokenizer.save_pretrained(tmpdirname)
-        new_tok = NllbTokenizer.from_pretrained(tmpdirname)
-        self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens)
-
     @require_torch
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
index 2e65d01ea..7ccc04bc5 100644
--- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import tempfile
 import unittest
 
 from transformers import (
@@ -499,14 +498,6 @@ class SeamlessM4TDistilledIntegrationTest(unittest.TestCase):
         self.assertEqual(ids[0], EN_CODE)
         self.assertEqual(len(ids), desired_max_length)
 
-    # Copied from tests.models.nllb.test_tokenization_nllb.NllbDistilledIntegrationTest.test_special_tokens_unaffacted_by_save_load with fairseq_tokens_to_ids->additional_special_tokens, Nllb->SeamlessM4T, Dict->List
-    def test_special_tokens_unaffacted_by_save_load(self):
-        tmpdirname = tempfile.mkdtemp()
-        original_special_tokens = self.tokenizer.additional_special_tokens
-        self.tokenizer.save_pretrained(tmpdirname)
-        new_tok = SeamlessM4TTokenizer.from_pretrained(tmpdirname)
-        self.assertListEqual(new_tok.additional_special_tokens, original_special_tokens)
-
     @require_torch
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(