Add documentation for BertJapanese (#11219)
* Start writing BERT-Japanese doc
* Fix typo, Update toctree
* Modify model file to use comment for document, Add examples
* Clean bert_japanese by make style
* Apply suggestions from code review
* Split a big code block into two
* Apply suggestions from code review
* Add prefix >>> to all lines in code blocks
* Clean bert_japanese by make fixup

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
commit 22fa0a6004 (parent 896d7be974)
3 changed files with 99 additions and 19 deletions
@@ -442,6 +442,7 @@ TensorFlow and/or Flax.
     model_doc/bert
     model_doc/bertweet
     model_doc/bertgeneration
+    model_doc/bert_japanese
     model_doc/bigbird
     model_doc/blenderbot
     model_doc/blenderbot_small
docs/source/model_doc/bert_japanese.rst (new file, 78 lines)
@@ -0,0 +1,78 @@
..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

BertJapanese
-----------------------------------------------------------------------------------------------------------------------

Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The BERT models trained on Japanese text.

There are models with two different tokenization methods:

- Tokenize with MeCab and WordPiece. This requires an extra dependency, `fugashi
  <https://github.com/polm/fugashi>`__, which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
- Tokenize into characters.

To use `MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
from source) to install the dependencies.

See `details on the cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.

Example of using a model with MeCab and WordPiece tokenization:

.. code-block::

    >>> import torch
    >>> from transformers import AutoModel, AutoTokenizer

    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

    >>> ## Input Japanese Text
    >>> line = "吾輩は猫である。"

    >>> inputs = tokenizer(line, return_tensors="pt")

    >>> print(tokenizer.decode(inputs['input_ids'][0]))
    [CLS] 吾輩 は 猫 で ある 。 [SEP]

    >>> outputs = bertjapanese(**inputs)
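The forward pass returns the usual BERT outputs. As a quick sanity check (a sketch, assuming the 8-token encoding
printed above and the base checkpoint's hidden size of 768):

.. code-block::

    >>> # [batch_size, sequence_length, hidden_size]; 8 tokens including [CLS] and [SEP]
    >>> outputs.last_hidden_state.shape
    torch.Size([1, 8, 768])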
Example of using a model with Character tokenization:

.. code-block::

    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")

    >>> ## Input Japanese Text
    >>> line = "吾輩は猫である。"

    >>> inputs = tokenizer(line, return_tensors="pt")

    >>> print(tokenizer.decode(inputs['input_ids'][0]))
    [CLS] 吾 輩 は 猫 で あ る 。 [SEP]

    >>> outputs = bertjapanese(**inputs)
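The character segmentation can also be inspected directly on the tokenizer (a sketch; exact list formatting may vary
slightly across versions):

.. code-block::

    >>> # the character tokenizer emits one token per character
    >>> tokenizer.tokenize(line)
    ['吾', '輩', 'は', '猫', 'で', 'あ', 'る', '。']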
Tips:

- This implementation is the same as BERT, except for the tokenization method. Refer to the :doc:`documentation of BERT
  <bert>` for more usage examples.

BertJapaneseTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BertJapaneseTokenizer
    :members:
@@ -70,7 +70,25 @@ PRETRAINED_INIT_CONFIGURATION = {


 class BertJapaneseTokenizer(BertTokenizer):
-    """BERT tokenizer for Japanese text"""
+    r"""
+    Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to a one-wordpiece-per-line vocabulary file.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
+        do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to do word tokenization.
+        do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to do subword tokenization.
+        word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`):
+            Type of word tokenizer.
+        subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`):
+            Type of subword tokenizer.
+        mecab_kwargs (:obj:`dict`, `optional`):
+            Dictionary passed to the :obj:`MecabTokenizer` constructor.
+    """

     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
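The new docstring documents the constructor arguments. As a rough usage sketch (not part of this diff; the
``mecab_dic`` key is an assumption, check ``MecabTokenizer`` for the keys it actually accepts):

.. code-block::

    >>> from transformers import BertJapaneseTokenizer

    >>> # mecab_kwargs is forwarded verbatim to the MecabTokenizer constructor
    >>> tokenizer = BertJapaneseTokenizer.from_pretrained(
    ...     "cl-tohoku/bert-base-japanese",
    ...     word_tokenizer_type="mecab",
    ...     subword_tokenizer_type="wordpiece",
    ...     mecab_kwargs={"mecab_dic": "ipadic"},  # assumed key; ipadic is the default dictionary
    ... )
    >>> tokenizer.tokenize("吾輩は猫である。")
    ['吾輩', 'は', '猫', 'で', 'ある', '。']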
@@ -94,23 +112,6 @@ class BertJapaneseTokenizer(BertTokenizer):
         mecab_kwargs=None,
         **kwargs
     ):
-        """
-        Constructs a MecabBertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
-            **do_word_tokenize**: (`optional`) boolean (default True)
-                Whether to do word tokenization.
-            **do_subword_tokenize**: (`optional`) boolean (default True)
-                Whether to do subword tokenization.
-            **word_tokenizer_type**: (`optional`) string (default "basic")
-                Type of word tokenizer.
-            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
-                Type of subword tokenizer.
-            **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None)
-        """
         super(BertTokenizer, self).__init__(
             unk_token=unk_token,
             sep_token=sep_token,
@@ -230,7 +231,7 @@ class MecabTokenizer:
             import fugashi
         except ModuleNotFoundError as error:
             raise error.__class__(
-                "You need to install fugashi to use MecabTokenizer."
+                "You need to install fugashi to use MecabTokenizer. "
                 "See https://pypi.org/project/fugashi/ for installation."
             )