diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 4adf8f753..808f8005f 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,3 +1,25 @@
+/* Our DOM objects */
+
+.framework-selector {
+    display: flex;
+    flex-direction: row;
+    justify-content: flex-end;
+}
+
+.framework-selector > button {
+    background-color: white;
+    color: #6670FF;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
+.framework-selector > button.selected {
+    background-color: #6670FF;
+    color: white;
+    border: 1px solid #6670FF;
+    padding: 5px;
+}
+
/* The literal code blocks */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
color: #6670FF;
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index 04cdfc1de..ac9388531 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -68,6 +68,74 @@ function addHfMenu() {
document.body.insertAdjacentHTML('afterbegin', div);
}
+function platformToggle() {
+    // Find each highlighted code block containing both framework markers and
+    // add PyTorch/TensorFlow toggle buttons that swap the displayed sample.
+    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
+    const pytorchIdentifier = "## PYTORCH CODE";
+    const tensorflowIdentifier = "## TENSORFLOW CODE";
+
+    // Split the block's HTML into its PyTorch and TensorFlow samples.
+    // The two samples may appear in either order.
+    const getFrameworkSpans = filteredCodeBlock => {
+        const spans = filteredCodeBlock.element.innerHTML;
+        const pytorchSpanPosition = spans.indexOf(pytorchIdentifier);
+        const tensorflowSpanPosition = spans.indexOf(tensorflowIdentifier);
+
+        let pytorchSpans;
+        let tensorflowSpans;
+
+        if (pytorchSpanPosition < tensorflowSpanPosition) {
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchIdentifier.length + 1, tensorflowSpanPosition);
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowIdentifier.length + 1, spans.length);
+        } else {
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowIdentifier.length + 1, pytorchSpanPosition);
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchIdentifier.length + 1, spans.length);
+        }
+
+        return {
+            ...filteredCodeBlock,
+            pytorchSample: pytorchSpans,
+            tensorflowSample: tensorflowSpans
+        };
+    };
+
+    // Prepend the two selector buttons to the code block, defaulting to PyTorch.
+    const createFrameworkButtons = sample => {
+        const pytorchButton = document.createElement("button");
+        pytorchButton.innerText = "PyTorch";
+
+        const tensorflowButton = document.createElement("button");
+        tensorflowButton.innerText = "TensorFlow";
+
+        const selectorDiv = document.createElement("div");
+        selectorDiv.classList.add("framework-selector");
+        selectorDiv.appendChild(pytorchButton);
+        selectorDiv.appendChild(tensorflowButton);
+        sample.element.parentElement.prepend(selectorDiv);
+
+        // Init on PyTorch
+        sample.element.innerHTML = sample.pytorchSample;
+        pytorchButton.classList.add("selected");
+        tensorflowButton.classList.remove("selected");
+
+        pytorchButton.addEventListener("click", () => {
+            sample.element.innerHTML = sample.pytorchSample;
+            pytorchButton.classList.add("selected");
+            tensorflowButton.classList.remove("selected");
+        });
+        tensorflowButton.addEventListener("click", () => {
+            sample.element.innerHTML = sample.tensorflowSample;
+            tensorflowButton.classList.add("selected");
+            pytorchButton.classList.remove("selected");
+        });
+    };
+
+    codeBlocks
+        .map(element => ({element: element.firstChild, innerText: element.innerText}))
+        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
+        .map(getFrameworkSpans)
+        .forEach(createFrameworkButtons);
+}
+
+
/*!
* github-buttons v2.2.10
* (c) 2019 なつき
@@ -85,6 +153,7 @@ function onLoad() {
addGithubButton();
parseGithubButtons();
addHfMenu();
+ platformToggle();
}
window.addEventListener("load", onLoad);
diff --git a/docs/source/_static/js/huggingface_logo.svg b/docs/source/_static/js/huggingface_logo.svg
index 84974866c..79a9e5d8a 100644
--- a/docs/source/_static/js/huggingface_logo.svg
+++ b/docs/source/_static/js/huggingface_logo.svg
@@ -1,47 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 18d52e9f2..763b3ac70 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,7 +20,7 @@ sys.path.insert(0, os.path.abspath('../../src'))
# -- Project information -----------------------------------------------------
project = u'transformers'
-copyright = u'2019, huggingface'
+copyright = u'2020, huggingface'
author = u'huggingface'
# The short X.Y version
@@ -105,6 +105,12 @@ html_static_path = ['_static']
#
# html_sidebars = {}
+# This must be the name of an image file (path relative to the configuration
+# directory) that is the favicon of the docs. Modern browsers use this as
+# the icon for tabs, windows and bookmarks. It should be a Windows-style
+# icon file (.ico).
+html_favicon = 'favicon.ico'
+
# -- Options for HTMLHelp output ---------------------------------------------
diff --git a/docs/source/favicon.ico b/docs/source/favicon.ico
new file mode 100644
index 000000000..424101de7
Binary files /dev/null and b/docs/source/favicon.ico differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 215e6cba6..5c593eacf 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
quickstart
glossary
pretrained_models
+ usage
model_sharing
examples
notebooks
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 000000000..8fb7a4472
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,597 @@
+Usage
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use cases of the library. The available models allow for many different
+configurations and great versatility across use cases. The simplest ones are presented here, showcasing usage
+for tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
+for more information.
+Feel free to modify the code to make it more specific and adapt it to your particular use case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the `run_$TASK.py` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or
+  you may create your own training script.
+
+To run inference on a task, several mechanisms are made available by the library:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
+  but much more powerful.
+
+Both approaches are showcased here.
+
+.. note::
+
+ All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+ checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+ additional head that is used for the task, initializing the weights of that head randomly.
+
+ This would produce random output.
+
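+For instance, loading a sequence classification architecture from a checkpoint that was only pre-trained leaves the
+classification head randomly initialized. A minimal sketch of what this looks like, using `bert-base-cased` (a
+pre-trained, not fine-tuned, checkpoint):
+
+::
+
+    from transformers import AutoModelForSequenceClassification
+
+    # The base transformer layers are loaded from the checkpoint, but the
+    # sequence classification head on top is newly (randomly) initialized.
+    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+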
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE benchmark, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`_ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py>`_ scripts.
+
+Here is an example of using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a model fine-tuned on SST-2, which is a GLUE task.
+
+::
+
+ from transformers import pipeline
+
+ nlp = pipeline("sentiment-analysis")
+
+ print(nlp("I hate you"))
+ print(nlp("I love you"))
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+::
+
+ [{'label': 'NEGATIVE', 'score': 0.9991129}]
+ [{'label': 'POSITIVE', 'score': 0.99986565}]
+
+
+Here is an example of doing sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Build a sequence from the two sentences, with the correct model-specific separators, token type ids
+  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
+  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
+- Pass this sequence through the model so that it is classified in one of the two available classes: 0
+  (not a paraphrase) and 1 (is a paraphrase)
+- Compute the softmax of the result to get probabilities over the classes
+- Print the results
+
+::
+
+ ## PYTORCH CODE
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+ classes = ["not paraphrase", "is paraphrase"]
+
+ sequence_0 = "The company HuggingFace is based in New York City"
+ sequence_1 = "Apples are especially bad for your health"
+ sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+ paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
+ not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
+
+ paraphrase_classification_logits = model(**paraphrase)[0]
+ not_paraphrase_classification_logits = model(**not_paraphrase)[0]
+
+ paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+ not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+ print("Should be paraphrase")
+ for i in range(len(classes)):
+ print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+ print("\nShould not be paraphrase")
+ for i in range(len(classes)):
+ print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+ ## TENSORFLOW CODE
+ from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+ import tensorflow as tf
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+ model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+ classes = ["not paraphrase", "is paraphrase"]
+
+ sequence_0 = "The company HuggingFace is based in New York City"
+ sequence_1 = "Apples are especially bad for your health"
+ sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+ paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
+ not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
+
+ paraphrase_classification_logits = model(paraphrase)[0]
+ not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+ paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+ not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+ print("Should be paraphrase")
+ for i in range(len(classes)):
+ print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
+
+ print("\nShould not be paraphrase")
+ for i in range(len(classes)):
+ print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
+
+This outputs the following results:
+
+::
+
+ Should be paraphrase
+ not paraphrase: 10%
+ is paraphrase: 90%
+
+ Should not be paraphrase
+ not paraphrase: 94%
+ is paraphrase: 6%
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the `run_squad.py` script.
+
+Here is an example of using the pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+::
+
+ from transformers import pipeline
+
+ nlp = pipeline("question-answering")
+
+ context = r"""
+ Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+ question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+ a model on a SQuAD task, you may leverage the `run_squad.py`.
+ """
+
+ print(nlp(question="What is extractive question answering?", context=context))
+ print(nlp(question="What is a good example of a question answering dataset?", context=context))
+
+This returns an answer extracted from the text, a confidence score, along with "start" and "end" values, which
+are the positions of the extracted answer in the text.
+
+::
+
+ {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
+ {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
+
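+The "start" and "end" values index directly into the original context string; for instance, reusing the pipeline and
+context defined above, slicing the context with these positions recovers the answer:
+
+::
+
+    result = nlp(question="What is extractive question answering?", context=context)
+
+    # The answer string is the span of the context between "start" and "end".
+    print(context[result["start"]:result["end"]])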
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and is loaded
+  with the weights stored in the checkpoint.
+- Define a text and a few questions.
+- Iterate over the questions and build a sequence from the text and the current question, with the correct
+  model-specific separators, token type ids and attention masks
+- Pass this sequence through the model. This outputs a range of scores across the entire sequence of tokens (question
+  and text), for both the start and end positions.
+- Compute the softmax of the result to get probabilities over the tokens
+- Fetch the tokens from the identified start and stop values, and convert those tokens to a string.
+- Print the results
+
+::
+
+ ## PYTORCH CODE
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+ model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+ text = r"""
+ 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+ architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+ Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+ TensorFlow 2.0 and PyTorch.
+ """
+
+ questions = [
+ "How many pretrained models are available in Transformers?",
+ "What does Transformers provide?",
+ "Transformers provides interoperability between which frameworks?",
+ ]
+
+ for question in questions:
+     inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
+     input_ids = inputs["input_ids"].tolist()[0]
+
+     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+     answer_start_scores, answer_end_scores = model(**inputs)
+
+     # Get the most likely beginning of the answer with the argmax of the score
+     answer_start = torch.argmax(answer_start_scores)
+     # Get the most likely end of the answer with the argmax of the score
+     answer_end = torch.argmax(answer_end_scores) + 1
+
+     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+     print(f"Question: {question}")
+     print(f"Answer: {answer}\n")
+ ## TENSORFLOW CODE
+ from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+ import tensorflow as tf
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+ model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+ text = r"""
+ 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+ architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+ Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+ TensorFlow 2.0 and PyTorch.
+ """
+
+ questions = [
+ "How many pretrained models are available in Transformers?",
+ "What does Transformers provide?",
+ "Transformers provides interoperability between which frameworks?",
+ ]
+
+ for question in questions:
+     inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
+     input_ids = inputs["input_ids"].numpy()[0]
+
+     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+     answer_start_scores, answer_end_scores = model(inputs)
+
+     # Get the most likely beginning of the answer with the argmax of the score
+     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
+     # Get the most likely end of the answer with the argmax of the score
+     answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy()[0]
+
+     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+
+     print(f"Question: {question}")
+     print(f"Answer: {answer}\n")
+
+This outputs the questions followed by the predicted answers:
+
+::
+
+ Question: How many pretrained models are available in Transformers?
+ Answer: over 32 +
+
+ Question: What does Transformers provide?
+ Answer: general - purpose architectures
+
+ Question: Transformers provides interoperability between which frameworks?
+ Answer: tensorflow 2 . 0 and pytorch
+
+
+
+Language Modeling
+----------------------------------------------------
+
+Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular
+transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling,
+GPT-2 with causal language modeling.
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it on a news dataset
+or on scientific papers, e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such training creates a strong basis
+for downstream tasks requiring bi-directional context, such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+::
+
+ from transformers import pipeline
+
+ nlp = pipeline("fill-mask")
+ print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
+vocabulary:
+
+::
+
+ [
+ {'sequence': ' HuggingFace is creating a tool that the community uses to solve NLP tasks.', 'score': 0.15627853572368622, 'token': 3944},
+ {'sequence': ' HuggingFace is creating a framework that the community uses to solve NLP tasks.', 'score': 0.11690319329500198, 'token': 7208},
+ {'sequence': ' HuggingFace is creating a library that the community uses to solve NLP tasks.', 'score': 0.058063216507434845, 'token': 5560},
+ {'sequence': ' HuggingFace is creating a database that the community uses to solve NLP tasks.', 'score': 0.04211743175983429, 'token': 8503},
+ {'sequence': ' HuggingFace is creating a prototype that the community uses to solve NLP tasks.', 'score': 0.024718601256608963, 'token': 17715}
+ ]
+
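+The token id can be mapped back to its string form with the pipeline's own tokenizer; for instance, decoding the id
+of the first candidate above:
+
+::
+
+    # 3944 is the id of the top candidate; this prints its text ("tool",
+    # possibly with a leading space, depending on the tokenizer).
+    print(nlp.tokenizer.decode([3944]))
+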
+Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
+  is loaded with the weights stored in the checkpoint.
+- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
+- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
+  values are the scores attributed to each token. The model gives higher scores to tokens it deems probable in that
+  context.
+- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+- Replace the mask token by the predicted tokens and print the results
+
+::
+
+ ## PYTORCH CODE
+ from transformers import AutoModelWithLMHead, AutoTokenizer
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+ model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+ sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+ input = tokenizer.encode(sequence, return_tensors="pt")
+ mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+ token_logits = model(input)[0]
+ mask_token_logits = token_logits[0, mask_token_index, :]
+
+ top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+ for token in top_5_tokens:
+     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+ ## TENSORFLOW CODE
+ from transformers import TFAutoModelWithLMHead, AutoTokenizer
+ import tensorflow as tf
+
+ tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+ model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+ sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+ input = tokenizer.encode(sequence, return_tensors="tf")
+ mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+ token_logits = model(input)[0]
+ mask_token_logits = token_logits[0, mask_token_index, :]
+
+ top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+ for token in top_5_tokens:
+     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+::
+
+ Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+ Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+ Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+ Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+ Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (tokens on the left of the mask). Such training is particularly interesting
+for generation tasks.
+
+There is currently no pipeline to do causal language modeling/generation.
+
+Here is an example using the tokenizer and model, leveraging the :func:`~transformers.PreTrainedModel.generate` method
+to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
+
+::
+
+ ## PYTORCH CODE
+ from transformers import AutoModelWithLMHead, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+ sequence = f"Hugging Face is based in DUMBO, New York City, and is"
+
+ input = tokenizer.encode(sequence, return_tensors="pt")
+ generated = model.generate(input, max_length=50)
+
+ resulting_string = tokenizer.decode(generated.tolist()[0])
+ print(resulting_string)
+ ## TENSORFLOW CODE
+ from transformers import TFAutoModelWithLMHead, AutoTokenizer
+ import tensorflow as tf
+
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+ sequence = f"Hugging Face is based in DUMBO, New York City, and is"
+ generated = tokenizer.encode(sequence)
+
+ for i in range(50):
+     predictions = model(tf.constant([generated]))[0]
+     token = tf.argmax(predictions[0], axis=1)[-1].numpy()
+     generated += [token]
+
+ resulting_string = tokenizer.decode(generated)
+ print(resulting_string)
+
+
+This outputs a (hopefully) coherent string from the original sequence, as the
+:func:`~transformers.PreTrainedModel.generate` method samples from a top_p/top_k distribution:
+
+::
+
+ Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
+ Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
+
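+The sampling behavior can be tuned through the arguments of :func:`~transformers.PreTrainedModel.generate`. A minimal
+sketch, reusing the PyTorch example above (the parameter values are illustrative):
+
+::
+
+    # Sample instead of decoding greedily, restricting candidates to the
+    # top_k most likely tokens and the top_p probability mass.
+    generated = model.generate(input, max_length=50, do_sample=True, top_k=50, top_p=0.95)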
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
+`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
+
+Here is an example of using the pipelines to do named entity recognition, trying to identify tokens as belonging to
+one of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+::
+
+ from transformers import pipeline
+
+ nlp = pipeline("ner")
+
+ sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+ "close to the Manhattan Bridge which is visible from the window."
+
+ print(nlp(sequence))
+
+This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here are
+the expected results:
+
+::
+
+ [
+ {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+ {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+ {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+ {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+ {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+ {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+ {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+ {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+ {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+ {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+ {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+ {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+ ]
+
+Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example of doing named entity recognition using a model and a tokenizer. The process is the following:
+
+- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
+  is loaded with the weights stored in the checkpoint.
+- Define the label list with which the model was trained.
+- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+- Split words into tokens so that they can be mapped to the predictions. We use a small hack by first fully
+  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
+- Encode that sequence into IDs (special tokens are added automatically).
+- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
+  for each token.
+- Zip together each token with its prediction and print it.
+
+::
+
+ ## PYTORCH CODE
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
+ import torch
+
+ model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+ label_list = [
+ "O", # Outside of a named entity
+ "B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
+ "I-MISC", # Miscellaneous entity
+ "B-PER", # Beginning of a person's name right after another person's name
+ "I-PER", # Person's name
+ "B-ORG", # Beginning of an organisation right after another organisation
+ "I-ORG", # Organisation
+ "B-LOC", # Beginning of a location right after another location
+ "I-LOC" # Location
+ ]
+
+ sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+ "close to the Manhattan Bridge."
+
+ # Bit of a hack to get the tokens with the special tokens
+ tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+ inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+ outputs = model(inputs)[0]
+ predictions = torch.argmax(outputs, dim=2)
+
+ print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
+ ## TENSORFLOW CODE
+ from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+ import tensorflow as tf
+
+ model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+ label_list = [
+ "O", # Outside of a named entity
+ "B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
+ "I-MISC", # Miscellaneous entity
+ "B-PER", # Beginning of a person's name right after another person's name
+ "I-PER", # Person's name
+ "B-ORG", # Beginning of an organisation right after another organisation
+ "I-ORG", # Organisation
+ "B-LOC", # Beginning of a location right after another location
+ "I-LOC" # Location
+ ]
+
+ sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+ "close to the Manhattan Bridge."
+
+ # Bit of a hack to get the tokens with the special tokens
+ tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+ inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+ outputs = model(inputs)[0]
+ predictions = tf.argmax(outputs, axis=2)
+
+ print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+
+This outputs a list of each token mapped to its prediction. Unlike the pipeline, here every token has a prediction,
+as we didn't remove the "O" class, which means that no particular entity was found on that token. The following
+array should be the output:
+
+::
+
+ [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]