mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
Adding usage examples for common tasks (#2850)
* Usage: Sequence Classification & Question Answering
* Pipeline example
* Language modeling
* TensorFlow code for Sequence classification
* Custom TF/PT toggler in docs
* QA + LM for TensorFlow
* Finish Usage for both PyTorch and TensorFlow
* Addressing Julien's comments
* More assertive
* cleanup
* Favicon
- added favicon option in conf.py along with the favicon image
- updated 🤗 logo. Slightly smaller and should appear more consistent across editing programs (no more tongue on the outside of the mouth)
Co-authored-by: joshchagani <joshua@joshuachagani.com>
This commit is contained in:
parent
e693cd1e87
commit
65e7c90a77
7 changed files with 697 additions and 48 deletions
|
|
@ -1,3 +1,25 @@
|
|||
/* Our DOM objects */

/* Right-aligned flex row that holds the PyTorch/TensorFlow toggle buttons
   injected above each dual-framework code sample. */
.framework-selector {
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
}

/* Unselected toggle button: white background, purple text and border. */
.framework-selector > button {
    background-color: white;
    border: 1px solid #6670FF;
    color: #6670FF;
    padding: 5px;
}

/* Selected toggle button: inverted colors (purple background, white text). */
.framework-selector > button.selected {
    background-color: #6670FF;
    border: 1px solid #6670FF;
    color: white;
    padding: 5px;
}
|
||||
|
||||
/* The literal code blocks */
|
||||
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
|
||||
color: #6670FF;
|
||||
|
|
|
|||
|
|
@ -68,6 +68,74 @@ function addHfMenu() {
|
|||
document.body.insertAdjacentHTML('afterbegin', div);
|
||||
}
|
||||
|
||||
/**
 * Adds a PyTorch/TensorFlow toggle above every highlighted documentation code
 * block that contains both a "## PYTORCH CODE" and a "## TENSORFLOW CODE"
 * marker comment, letting the reader switch between the two framework-specific
 * samples. The PyTorch sample is shown by default.
 *
 * Fix over the previous version: the "show sample + flip button highlighting"
 * logic was duplicated verbatim between the initialization path and the
 * PyTorch click handler (and mirrored for TensorFlow); it is now factored
 * into small helpers so the two paths cannot drift apart.
 */
function platformToggle() {
    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
    const pytorchIdentifier = "## PYTORCH CODE";
    const tensorflowIdentifier = "## TENSORFLOW CODE";
    // The syntax highlighter renders the marker comments as
    // <span class="c1"> elements, so the split points are searched for in the
    // highlighted HTML rather than in the raw text.
    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;

    // Split a code block's highlighted HTML into its PyTorch and TensorFlow
    // halves. The two markers may appear in either order; each sample runs
    // from just after its marker to the other marker (or to the end).
    const getFrameworkSpans = filteredCodeBlock => {
        const spans = filteredCodeBlock.element.innerHTML;
        const pytorchSpanPosition = spans.indexOf(pytorchSpanIdentifier);
        const tensorflowSpanPosition = spans.indexOf(tensorflowSpanIdentifier);

        let pytorchSpans;
        let tensorflowSpans;

        // "+ 1" skips the newline immediately following each marker span.
        if (pytorchSpanPosition < tensorflowSpanPosition) {
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
        } else {
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
        }

        return {
            ...filteredCodeBlock,
            pytorchSample: pytorchSpans,
            tensorflowSample: tensorflowSpans
        };
    };

    // Prepend the two toggle buttons to the sample's parent element and wire
    // up the click handlers.
    const createFrameworkButtons = sample => {
        const pytorchButton = document.createElement("button");
        pytorchButton.innerText = "PyTorch";

        const tensorflowButton = document.createElement("button");
        tensorflowButton.innerText = "TensorFlow";

        const selectorDiv = document.createElement("div");
        selectorDiv.classList.add("framework-selector");
        selectorDiv.appendChild(pytorchButton);
        selectorDiv.appendChild(tensorflowButton);
        sample.element.parentElement.prepend(selectorDiv);

        // Show one framework's sample and highlight the matching button.
        const select = (sampleHtml, activeButton, inactiveButton) => {
            sample.element.innerHTML = sampleHtml;
            activeButton.classList.add("selected");
            inactiveButton.classList.remove("selected");
        };
        const selectPytorch = () => select(sample.pytorchSample, pytorchButton, tensorflowButton);
        const selectTensorflow = () => select(sample.tensorflowSample, tensorflowButton, pytorchButton);

        // Init on PyTorch
        selectPytorch();
        pytorchButton.addEventListener("click", selectPytorch);
        tensorflowButton.addEventListener("click", selectTensorflow);
    };

    codeBlocks
        .map(element => ({element: element.firstChild, innerText: element.innerText}))
        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
        .map(getFrameworkSpans)
        .forEach(createFrameworkButtons);
}
|
||||
|
||||
|
||||
/*!
|
||||
* github-buttons v2.2.10
|
||||
* (c) 2019 なつき
|
||||
|
|
@ -85,6 +153,7 @@ function onLoad() {
|
|||
addGithubButton();
|
||||
parseGithubButtons();
|
||||
addHfMenu();
|
||||
platformToggle();
|
||||
}
|
||||
|
||||
window.addEventListener("load", onLoad);
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 7.6 KiB |
|
|
@ -20,7 +20,7 @@ sys.path.insert(0, os.path.abspath('../../src'))
|
|||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = u'transformers'
|
||||
copyright = u'2019, huggingface'
|
||||
copyright = u'2020, huggingface'
|
||||
author = u'huggingface'
|
||||
|
||||
# The short X.Y version
|
||||
|
|
@ -105,6 +105,12 @@ html_static_path = ['_static']
|
|||
#
|
||||
# html_sidebars = {}
|
||||
|
||||
# This must be the name of an image file (path relative to the configuration
|
||||
# directory) that is the favicon of the docs. Modern browsers use this as
|
||||
# the icon for tabs, windows and bookmarks. It should be a Windows-style
|
||||
# icon file (.ico).
|
||||
html_favicon = 'favicon.ico'
|
||||
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
|
|
|
|||
BIN
docs/source/favicon.ico
Normal file
BIN
docs/source/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 47 KiB |
|
|
@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||
quickstart
|
||||
glossary
|
||||
pretrained_models
|
||||
usage
|
||||
model_sharing
|
||||
examples
|
||||
notebooks
|
||||
|
|
|
|||
597
docs/source/usage.rst
Normal file
597
docs/source/usage.rst
Normal file
|
|
@ -0,0 +1,597 @@
|
|||
Usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This page shows the most frequent use-cases when using the library. The models available allow for many different
|
||||
configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
|
||||
for tasks such as question answering, sequence classification, named entity recognition and others.
|
||||
|
||||
These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
|
||||
automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
|
||||
for more information.
|
||||
Feel free to modify the code to be more specific and adapt it to your specific use-case.
|
||||
|
||||
In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
|
||||
checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
|
||||
following:
|
||||
|
||||
- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
|
||||
one of the `run_$TASK.py` script in the
|
||||
`examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
|
||||
- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
|
||||
and domain. As mentioned previously, you may leverage the
|
||||
`examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
|
||||
may create your own training script.
|
||||
|
||||
In order to do an inference on a task, several mechanisms are made available by the library:
|
||||
|
||||
- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
|
||||
- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
|
||||
but much more powerful.
|
||||
|
||||
Both approaches are showcased here.
|
||||
|
||||
.. note::
|
||||
|
||||
All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
|
||||
checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
|
||||
additional head that is used for the task, initializing the weights of that head randomly.
|
||||
|
||||
This would produce random output.
|
||||
|
||||
Sequence Classification
|
||||
--------------------------
|
||||
|
||||
Sequence classification is the task of classifying sequences according to a given number of classes. An example
|
||||
of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a GLUE sequence classification task, you may leverage the
|
||||
`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_glue.py>`_ or
|
||||
`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_tf_glue.py>`_ scripts.
|
||||
|
||||
Here is an example using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
|
||||
It leverages a fine-tuned model on sst2, which is a GLUE task.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("sentiment-analysis")
|
||||
|
||||
print(nlp("I hate you"))
|
||||
print(nlp("I love you"))
|
||||
|
||||
This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
|
||||
|
||||
::
|
||||
|
||||
[{'label': 'NEGATIVE', 'score': 0.9991129}]
|
||||
[{'label': 'POSITIVE', 'score': 0.99986565}]
|
||||
|
||||
|
||||
Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
|
||||
of each other. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
|
||||
with the weights stored in the checkpoint.
|
||||
- Build a sequence from the two sentences, with the correct model-specific separators token type ids
|
||||
and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
|
||||
- Pass this sequence through the model so that it is classified in one of the two available classes: 0
|
||||
(not a paraphrase) and 1 (is a paraphrase)
|
||||
- Compute the softmax of the result to get probabilities over the classes
|
||||
- Print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
|
||||
classes = ["not paraphrase", "is paraphrase"]
|
||||
|
||||
sequence_0 = "The company HuggingFace is based in New York City"
|
||||
sequence_1 = "Apples are especially bad for your health"
|
||||
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
|
||||
|
||||
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
|
||||
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
|
||||
|
||||
paraphrase_classification_logits = model(**paraphrase)[0]
|
||||
not_paraphrase_classification_logits = model(**not_paraphrase)[0]
|
||||
|
||||
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
|
||||
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
|
||||
|
||||
print("Should be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
|
||||
|
||||
print("\nShould not be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
|
||||
## TENSORFLOW CODE
|
||||
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
|
||||
classes = ["not paraphrase", "is paraphrase"]
|
||||
|
||||
sequence_0 = "The company HuggingFace is based in New York City"
|
||||
sequence_1 = "Apples are especially bad for your health"
|
||||
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
|
||||
|
||||
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
|
||||
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
|
||||
|
||||
paraphrase_classification_logits = model(paraphrase)[0]
|
||||
not_paraphrase_classification_logits = model(not_paraphrase)[0]
|
||||
|
||||
paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
|
||||
not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
|
||||
|
||||
print("Should be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
|
||||
|
||||
print("\nShould not be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
|
||||
|
||||
This outputs the following results:
|
||||
|
||||
::
|
||||
|
||||
Should be paraphrase
|
||||
not paraphrase: 10%
|
||||
is paraphrase: 90%
|
||||
|
||||
Should not be paraphrase
|
||||
not paraphrase: 94%
|
||||
is paraphrase: 6%
|
||||
|
||||
Extractive Question Answering
|
||||
----------------------------------------------------
|
||||
|
||||
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
|
||||
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a SQuAD task, you may leverage the `run_squad.py`.
|
||||
|
||||
Here is an example using the pipelines to do question answering: extracting an answer from a text given a question.
|
||||
It leverages a fine-tuned model on SQuAD.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("question-answering")
|
||||
|
||||
context = r"""
|
||||
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
|
||||
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a SQuAD task, you may leverage the `run_squad.py`.
|
||||
"""
|
||||
|
||||
print(nlp(question="What is extractive question answering?", context=context))
|
||||
print(nlp(question="What is a good example of a question answering dataset?", context=context))
|
||||
|
||||
This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
|
||||
are the positions of the extracted answer in the text.
|
||||
|
||||
::
|
||||
|
||||
{'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
|
||||
{'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
|
||||
|
||||
|
||||
Here is an example of question answering using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
|
||||
with the weights stored in the checkpoint.
|
||||
- Define a text and a few questions.
|
||||
- Iterate over the questions and build a sequence from the text and the current question, with the correct
|
||||
model-specific separators token type ids and attention masks
|
||||
- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
|
||||
text), for both the start and end positions.
|
||||
- Compute the softmax of the result to get probabilities over the tokens
|
||||
- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
|
||||
- Print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
|
||||
text = r"""
|
||||
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
|
||||
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
|
||||
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
|
||||
TensorFlow 2.0 and PyTorch.
|
||||
"""
|
||||
|
||||
questions = [
|
||||
"How many pretrained models are available in Transformers?",
|
||||
"What does Transformers provide?",
|
||||
"Transformers provides interoperability between which frameworks?",
|
||||
]
|
||||
|
||||
for question in questions:
|
||||
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
|
||||
input_ids = inputs["input_ids"].tolist()[0]
|
||||
|
||||
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
answer_start_scores, answer_end_scores = model(**inputs)
|
||||
|
||||
answer_start = torch.argmax(
|
||||
answer_start_scores
|
||||
) # Get the most likely beginning of answer with the argmax of the score
|
||||
answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score
|
||||
|
||||
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
|
||||
|
||||
print(f"Question: {question}")
|
||||
print(f"Answer: {answer}\n")
|
||||
## TENSORFLOW CODE
|
||||
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
|
||||
text = r"""
|
||||
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
|
||||
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
|
||||
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
|
||||
TensorFlow 2.0 and PyTorch.
|
||||
"""
|
||||
|
||||
questions = [
|
||||
"How many pretrained models are available in Transformers?",
|
||||
"What does Transformers provide?",
|
||||
"Transformers provides interoperability between which frameworks?",
|
||||
]
|
||||
|
||||
for question in questions:
|
||||
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
|
||||
input_ids = inputs["input_ids"].numpy()[0]
|
||||
|
||||
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
answer_start_scores, answer_end_scores = model(inputs)
|
||||
|
||||
answer_start = tf.argmax(
|
||||
answer_start_scores, axis=1
|
||||
).numpy()[0] # Get the most likely beginning of answer with the argmax of the score
|
||||
answer_end = (
|
||||
tf.argmax(answer_end_scores, axis=1) + 1
|
||||
).numpy()[0] # Get the most likely end of answer with the argmax of the score
|
||||
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
|
||||
|
||||
print(f"Question: {question}")
|
||||
print(f"Answer: {answer}\n")
|
||||
|
||||
This outputs the questions followed by the predicted answers:
|
||||
|
||||
::
|
||||
|
||||
Question: How many pretrained models are available in Transformers?
|
||||
Answer: over 32 +
|
||||
|
||||
Question: What does Transformers provide?
|
||||
Answer: general - purpose architectures
|
||||
|
||||
Question: Transformers provides interoperability between which frameworks?
|
||||
Answer: tensorflow 2 . 0 and pytorch
|
||||
|
||||
|
||||
|
||||
Language Modeling
|
||||
----------------------------------------------------
|
||||
|
||||
Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
|
||||
based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
|
||||
causal language modeling.
|
||||
|
||||
Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
|
||||
domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
|
||||
or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
|
||||
|
||||
Masked Language Modeling
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
|
||||
fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
|
||||
right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
|
||||
for downstream tasks requiring bi-directional context such as SQuAD (question answering,
|
||||
see `Lewis, Lui, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
|
||||
|
||||
Here is an example of using pipelines to replace a mask from a sequence:
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("fill-mask")
|
||||
print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
|
||||
|
||||
This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
|
||||
vocabulary:
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
|
||||
{'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
|
||||
{'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
|
||||
{'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
|
||||
{'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
|
||||
]
|
||||
|
||||
Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
|
||||
loads it with the weights stored in the checkpoint.
|
||||
- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
|
||||
- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
|
||||
- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
|
||||
values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
|
||||
context.
|
||||
- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
|
||||
- Replace the mask token by the tokens and print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelWithLMHead, AutoTokenizer
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
|
||||
model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
|
||||
|
||||
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="pt")
|
||||
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
|
||||
|
||||
token_logits = model(input)[0]
|
||||
mask_token_logits = token_logits[0, mask_token_index, :]
|
||||
|
||||
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
|
||||
for token in top_5_tokens:
|
||||
print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelWithLMHead, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
|
||||
model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
|
||||
|
||||
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="tf")
|
||||
mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
|
||||
|
||||
token_logits = model(input)[0]
|
||||
mask_token_logits = token_logits[0, mask_token_index, :]
|
||||
|
||||
top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
|
||||
|
||||
for token in top_5_tokens:
|
||||
print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
|
||||
|
||||
This prints five sequences, with the top 5 tokens predicted by the model:
|
||||
|
||||
::
|
||||
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
|
||||
|
||||
|
||||
Causal Language Modeling
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
|
||||
model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
|
||||
for generation tasks.
|
||||
|
||||
There is currently no pipeline to do causal language modeling/generation.
|
||||
|
||||
Here is an example using the tokenizer and model, leveraging the :func:`~transformers.PreTrainedModel.generate` method
|
||||
to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelWithLMHead, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
model = AutoModelWithLMHead.from_pretrained("gpt2")
|
||||
|
||||
sequence = f"Hugging Face is based in DUMBO, New York City, and is"
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="pt")
|
||||
generated = model.generate(input, max_length=50)
|
||||
|
||||
resulting_string = tokenizer.decode(generated.tolist()[0])
|
||||
print(resulting_string)
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelWithLMHead, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
model = TFAutoModelWithLMHead.from_pretrained("gpt2")
|
||||
|
||||
sequence = f"Hugging Face is based in DUMBO, New York City, and is"
|
||||
generated = tokenizer.encode(sequence)
|
||||
|
||||
for i in range(50):
|
||||
predictions = model(tf.constant([generated]))[0]
|
||||
token = tf.argmax(predictions[0], axis=1)[-1].numpy()
|
||||
generated += [token]
|
||||
|
||||
resulting_string = tokenizer.decode(generated)
|
||||
print(resulting_string)
|
||||
|
||||
|
||||
This outputs a (hopefully) coherent string from the original sequence, as the
|
||||
:func:`~transformers.PreTrainedModel.generate` samples from a top_p/top_k distribution:
|
||||
|
||||
::
|
||||
|
||||
Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
|
||||
Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
|
||||
|
||||
|
||||
Named Entity Recognition
|
||||
----------------------------------------------------
|
||||
|
||||
Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
|
||||
token as a person, an organisation or a location.
|
||||
An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
|
||||
If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
|
||||
`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
|
||||
|
||||
Here is an example using the pipelines to do named entity recognition, trying to identify tokens as belonging to one
|
||||
of 9 classes:
|
||||
|
||||
- O, Outside of a named entity
|
||||
- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
- I-MISC, Miscellaneous entity
|
||||
- B-PER, Beginning of a person's name right after another person's name
|
||||
- I-PER, Person's name
|
||||
- B-ORG, Beginning of an organisation right after another organisation
|
||||
- I-ORG, Organisation
|
||||
- B-LOC, Beginning of a location right after another location
|
||||
- I-LOC, Location
|
||||
|
||||
It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
|
||||
`dbmdz <https://github.com/dbmdz>`__.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("ner")
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge which is visible from the window."
|
||||
|
||||
print(nlp(sequence))
|
||||
|
||||
This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here is the
|
||||
expected results:
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
|
||||
{'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
|
||||
{'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
|
||||
{'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
|
||||
{'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
|
||||
{'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
|
||||
{'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
|
||||
{'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
|
||||
{'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
|
||||
{'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
|
||||
{'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
|
||||
{'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
|
||||
]
|
||||
|
||||
Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
|
||||
"Manhattan Bridge" have been identified as locations.
|
||||
|
||||
Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
|
||||
  is loaded with the weights stored in the checkpoint.
|
||||
- Define the label list with which the model was trained.
|
||||
- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
|
||||
- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
|
||||
encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
|
||||
- Encode that sequence into IDs (special tokens are added automatically).
|
||||
- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
|
||||
distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
|
||||
for each token.
|
||||
- Zip together each token with its prediction and print it.
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
||||
import torch
|
||||
|
||||
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
label_list = [
|
||||
"O", # Outside of a named entity
|
||||
"B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
"I-MISC", # Miscellaneous entity
|
||||
"B-PER", # Beginning of a person's name right after another person's name
|
||||
"I-PER", # Person's name
|
||||
"B-ORG", # Beginning of an organisation right after another organisation
|
||||
"I-ORG", # Organisation
|
||||
"B-LOC", # Beginning of a location right after another location
|
||||
"I-LOC" # Location
|
||||
]
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge."
|
||||
|
||||
# Bit of a hack to get the tokens with the special tokens
|
||||
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
|
||||
inputs = tokenizer.encode(sequence, return_tensors="pt")
|
||||
|
||||
outputs = model(inputs)[0]
|
||||
predictions = torch.argmax(outputs, dim=2)
|
||||
|
||||
print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
label_list = [
|
||||
"O", # Outside of a named entity
|
||||
"B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
"I-MISC", # Miscellaneous entity
|
||||
"B-PER", # Beginning of a person's name right after another person's name
|
||||
"I-PER", # Person's name
|
||||
"B-ORG", # Beginning of an organisation right after another organisation
|
||||
"I-ORG", # Organisation
|
||||
"B-LOC", # Beginning of a location right after another location
|
||||
"I-LOC" # Location
|
||||
]
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge."
|
||||
|
||||
# Bit of a hack to get the tokens with the special tokens
|
||||
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
|
||||
inputs = tokenizer.encode(sequence, return_tensors="tf")
|
||||
|
||||
outputs = model(inputs)[0]
|
||||
predictions = tf.argmax(outputs, axis=2)
|
||||
|
||||
print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
|
||||
|
||||
This outputs a list of each token mapped to its prediction. Differently from the pipeline, here every token has
|
||||
a prediction as we didn't remove the "O" class which means that no particular entity was found on that token. The
|
||||
following array should be the output:
|
||||
|
||||
::
|
||||
|
||||
[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
|
||||
Loading…
Reference in a new issue