diff --git a/scripts/pegasus/build_test_sample_spm_no_bos.py b/scripts/pegasus/build_test_sample_spm_no_bos.py
new file mode 100755
index 000000000..92ec94c42
--- /dev/null
+++ b/scripts/pegasus/build_test_sample_spm_no_bos.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus
+
+# 1. pip install sentencepiece
+#
+# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
+
+# 3. build
+import sentencepiece as spm
+
+# pegasus:
+# 1. no bos
+# 2. eos_id is 1
+# 3. unk_id is 2
+# build a sample spm file accordingly
+spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')
+
+# 4. now update the fixture
+# mv test_sentencepiece_no_bos.model ../../tests/fixtures/
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 1dfbf66e5..afc70672e 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -184,13 +184,23 @@ def require_faiss(test_case):
     return test_case
 
 
-def get_tests_dir():
+def get_tests_dir(append_path=None):
     """
-    returns the full path to the `tests` dir, so that the tests can be invoked from anywhere
+    Args:
+        append_path: optional path to append to the tests dir path
+
+    Return:
+        The full path to the `tests` dir, so that the tests can be invoked from anywhere.
+        Optionally `append_path` is joined after the `tests` dir if the former is provided.
+
     """
     # this function caller's __file__
     caller__file__ = inspect.stack()[1][1]
-    return os.path.abspath(os.path.dirname(caller__file__))
+    tests_dir = os.path.abspath(os.path.dirname(caller__file__))
+    if append_path:
+        return os.path.join(tests_dir, append_path)
+    else:
+        return tests_dir
 
 
 #
diff --git a/src/transformers/tokenization_pegasus.py b/src/transformers/tokenization_pegasus.py
index 346bcdb58..e29d7439d 100644
--- a/src/transformers/tokenization_pegasus.py
+++ b/src/transformers/tokenization_pegasus.py
@@ -49,7 +49,7 @@ class PegasusTokenizer(ReformerTokenizer):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # Dont use reserved words added_token_encoder, added_tokens_decoder because of
+        # Don't use reserved words added_token_encoder, added_tokens_decoder because of
         # AssertionError: Non-consecutive added token '1' found. in from_pretrained
         assert len(self.added_tokens_decoder) == 0
         self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
@@ -58,7 +58,7 @@ class PegasusTokenizer(ReformerTokenizer):
         self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
 
     def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token (str) in an id using the vocab. """
+        """ Converts a token (str) to an id using the vocab. """
""" if token in self.decoder: return self.decoder[token] elif token in self.added_tokens_decoder: @@ -67,7 +67,7 @@ class PegasusTokenizer(ReformerTokenizer): return sp_id + self.offset def _convert_id_to_token(self, index: int) -> str: - """Converts an index (integer) in a token (str) using the vocab.""" + """Converts an index (integer) to a token (str) using the vocab.""" if index in self.encoder: return self.encoder[index] elif index in self.added_tokens_encoder: @@ -81,11 +81,6 @@ class PegasusTokenizer(ReformerTokenizer): def vocab_size(self) -> int: return len(self.sp_model) + self.offset - def get_vocab(self) -> Dict[str, int]: - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - def num_special_tokens_to_add(self, pair=False): """Just EOS""" return 1 @@ -109,12 +104,12 @@ class PegasusTokenizer(ReformerTokenizer): def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence: - single sequence: ``X `` - - pair of sequences: ``A B `` (not intended use) + - pair of sequences: ``A B `` (not intended use) BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a separator. diff --git a/src/transformers/tokenization_reformer.py b/src/transformers/tokenization_reformer.py index 017e4a346..a2b2e7856 100644 --- a/src/transformers/tokenization_reformer.py +++ b/src/transformers/tokenization_reformer.py @@ -17,6 +17,7 @@ import os from shutil import copyfile +from typing import Dict from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils_fast import PreTrainedTokenizerFast @@ -119,7 +120,7 @@ class ReformerTokenizer(PreTrainedTokenizer): def vocab_size(self): return self.sp_model.get_piece_size() - def get_vocab(self): + def get_vocab(self) -> Dict[str, int]: vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 188b8ac76..b4d8829f4 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -186,7 +186,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) print('We have added', num_added_toks, 'tokens') - # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
             model.resize_token_embeddings(len(tokenizer))
 
         """
         new_tokens = [str(tok) for tok in new_tokens]
diff --git a/tests/fixtures/test_sentencepiece_no_bos.model b/tests/fixtures/test_sentencepiece_no_bos.model
new file mode 100644
index 000000000..c3336ae60
Binary files /dev/null and b/tests/fixtures/test_sentencepiece_no_bos.model differ
diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py
index 3943322bf..ba3e84058 100644
--- a/tests/test_tokenization_pegasus.py
+++ b/tests/test_tokenization_pegasus.py
@@ -1,13 +1,15 @@
 import unittest
-from pathlib import Path
 
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import get_tests_dir, require_torch
 from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
 
 from .test_tokenization_common import TokenizerTesterMixin
 
 
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
+
+
 class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     tokenizer_class = PegasusTokenizer
@@ -17,11 +19,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
     def setUp(self):
         super().setUp()
-        save_dir = Path(self.tmpdirname)
-        spm_file = PegasusTokenizer.vocab_files_names["vocab_file"]
-        if not (save_dir / spm_file).exists():
-            tokenizer = self.pegasus_large_tokenizer
-            tokenizer.save_pretrained(self.tmpdirname)
+        # We have a SentencePiece fixture for testing
+        tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
 
     @cached_property
     def pegasus_large_tokenizer(self):
@@ -32,10 +32,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         pass
 
     def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
-        if not kwargs:
-            return self.pegasus_large_tokenizer
-        else:
-            return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         return ("This is a test", "This is a test")
diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py
index 234e9f91f..9b670478c 100644
--- a/tests/test_tokenization_t5.py
+++ b/tests/test_tokenization_t5.py
@@ -14,19 +14,18 @@
 # limitations under the License.
 
 
-import os
 import unittest
 
 from transformers import BatchEncoding
 from transformers.file_utils import cached_property
-from transformers.testing_utils import _torch_available
+from transformers.testing_utils import _torch_available, get_tests_dir
 from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE
 
 from .test_tokenization_common import TokenizerTesterMixin
 
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 
 FRAMEWORK = "pt" if _torch_available else "tf"
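
For reference, a minimal usage sketch of the reworked `get_tests_dir` helper (it only restates what the patch introduces; note the helper locates the caller's file via `inspect.stack()`, so it must be called from a module living under `tests/`):

# usage sketch: both forms resolve against the calling test module's tests/ dir
from transformers.testing_utils import get_tests_dir

tests_dir = get_tests_dir()  # e.g. /path/to/repo/tests
# with the new optional append_path argument, a fixture path is a single call:
sample_vocab = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")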
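And a quick sanity check for the generated fixture — a sketch, assuming `pip install sentencepiece` and a model built with the exact trainer flags from the build script above — confirming the three Pegasus-specific properties (no BOS, eos_id 1, unk_id 2):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("tests/fixtures/test_sentencepiece_no_bos.model")
assert sp.bos_id() == -1  # BOS disabled via --bos_id=-1
assert sp.eos_id() == 1   # --eos_id=1
assert sp.unk_id() == 2   # --unk_id=2
assert sp.get_piece_size() == 1000  # --vocab_size=1000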