From 699bc7e86ea2253bfb2f011f006180b2e49f0703 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Fri, 12 Jul 2019 11:46:57 +0200
Subject: [PATCH] fix gpt-2 unk token test

---
 docs/README.md                            | 2 +-
 pytorch_transformers/tokenization_gpt2.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index c39ecda0d..1b3c1fead 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -57,4 +57,4 @@ It should build the static app that will be available under `/docs/_build/html`
 ## Adding a new element to the tree (toc-tree)
 
 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
-in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
\ No newline at end of file
+in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 6084dc3e0..bd90a9225 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -177,7 +177,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        return self.encoder.get(token)
+        if token in self.encoder:
+            return self.encoder.get(token)
+        return self.encoder.get(self.unk_token)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""