mirror of
https://github.com/saymrwulf/transformers.git
synced 2026-05-14 20:58:08 +00:00
Adding usage examples for common tasks (#2850)
* Usage: Sequence Classification & Question Answering
* Pipeline example
* Language modeling
* TensorFlow code for Sequence classification
* Custom TF/PT toggler in docs
* QA + LM for TensorFlow
* Finish Usage for both PyTorch and TensorFlow
* Addressing Julien's comments
* More assertive
* cleanup
* Favicon
- added favicon option in conf.py along with the favicon image
- updated 🤗 logo. Slightly smaller and should appear more consistent across editing programs (no more tongue on the outside of the mouth)
Co-authored-by: joshchagani <joshua@joshuachagani.com>
This commit is contained in:
parent
e693cd1e87
commit
65e7c90a77
7 changed files with 697 additions and 48 deletions
|
|
@ -1,3 +1,25 @@
|
|||
/* Our DOM objects */

/* Right-aligned flex row that holds the PyTorch/TensorFlow toggle buttons
   injected above each dual-framework code sample. */
.framework-selector {
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
}

/* Unselected toggle button: white background, purple text and border. */
.framework-selector > button {
    background-color: white;
    border: 1px solid #6670FF;
    color: #6670FF;
    padding: 5px;
}

/* Selected toggle button: inverted colors (purple background, white text). */
.framework-selector > button.selected {
    background-color: #6670FF;
    border: 1px solid #6670FF;
    color: white;
    padding: 5px;
}
|
||||
|
||||
/* The literal code blocks */
|
||||
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
|
||||
color: #6670FF;
|
||||
|
|
|
|||
|
|
@ -68,6 +68,74 @@ function addHfMenu() {
|
|||
document.body.insertAdjacentHTML('afterbegin', div);
|
||||
}
|
||||
|
||||
/**
 * Adds a PyTorch/TensorFlow toggle above every highlighted documentation code
 * block that contains both a "## PYTORCH CODE" and a "## TENSORFLOW CODE"
 * marker comment, letting the reader switch between the two framework-specific
 * samples. The PyTorch sample is shown by default.
 *
 * Fix over the previous version: the "show sample + flip button highlighting"
 * logic was duplicated verbatim between the initialization path and the
 * PyTorch click handler (and mirrored for TensorFlow); it is now factored
 * into small helpers so the two paths cannot drift apart.
 */
function platformToggle() {
    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
    const pytorchIdentifier = "## PYTORCH CODE";
    const tensorflowIdentifier = "## TENSORFLOW CODE";
    // The syntax highlighter renders the marker comments as
    // <span class="c1"> elements, so the split points are searched for in the
    // highlighted HTML rather than in the raw text.
    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;

    // Split a code block's highlighted HTML into its PyTorch and TensorFlow
    // halves. The two markers may appear in either order; each sample runs
    // from just after its marker to the other marker (or to the end).
    const getFrameworkSpans = filteredCodeBlock => {
        const spans = filteredCodeBlock.element.innerHTML;
        const pytorchSpanPosition = spans.indexOf(pytorchSpanIdentifier);
        const tensorflowSpanPosition = spans.indexOf(tensorflowSpanIdentifier);

        let pytorchSpans;
        let tensorflowSpans;

        // "+ 1" skips the newline immediately following each marker span.
        if (pytorchSpanPosition < tensorflowSpanPosition) {
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
        } else {
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
        }

        return {
            ...filteredCodeBlock,
            pytorchSample: pytorchSpans,
            tensorflowSample: tensorflowSpans
        };
    };

    // Prepend the two toggle buttons to the sample's parent element and wire
    // up the click handlers.
    const createFrameworkButtons = sample => {
        const pytorchButton = document.createElement("button");
        pytorchButton.innerText = "PyTorch";

        const tensorflowButton = document.createElement("button");
        tensorflowButton.innerText = "TensorFlow";

        const selectorDiv = document.createElement("div");
        selectorDiv.classList.add("framework-selector");
        selectorDiv.appendChild(pytorchButton);
        selectorDiv.appendChild(tensorflowButton);
        sample.element.parentElement.prepend(selectorDiv);

        // Show one framework's sample and highlight the matching button.
        const select = (sampleHtml, activeButton, inactiveButton) => {
            sample.element.innerHTML = sampleHtml;
            activeButton.classList.add("selected");
            inactiveButton.classList.remove("selected");
        };
        const selectPytorch = () => select(sample.pytorchSample, pytorchButton, tensorflowButton);
        const selectTensorflow = () => select(sample.tensorflowSample, tensorflowButton, pytorchButton);

        // Init on PyTorch
        selectPytorch();
        pytorchButton.addEventListener("click", selectPytorch);
        tensorflowButton.addEventListener("click", selectTensorflow);
    };

    codeBlocks
        .map(element => ({element: element.firstChild, innerText: element.innerText}))
        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
        .map(getFrameworkSpans)
        .forEach(createFrameworkButtons);
}
|
||||
|
||||
|
||||
/*!
|
||||
* github-buttons v2.2.10
|
||||
* (c) 2019 なつき
|
||||
|
|
@ -85,6 +153,7 @@ function onLoad() {
|
|||
addGithubButton();
|
||||
parseGithubButtons();
|
||||
addHfMenu();
|
||||
platformToggle();
|
||||
}
|
||||
|
||||
window.addEventListener("load", onLoad);
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 7.6 KiB |
|
|
@ -20,7 +20,7 @@ sys.path.insert(0, os.path.abspath('../../src'))
|
|||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = u'transformers'
|
||||
copyright = u'2019, huggingface'
|
||||
copyright = u'2020, huggingface'
|
||||
author = u'huggingface'
|
||||
|
||||
# The short X.Y version
|
||||
|
|
@ -105,6 +105,12 @@ html_static_path = ['_static']
|
|||
#
|
||||
# html_sidebars = {}
|
||||
|
||||
# This must be the name of an image file (path relative to the configuration
|
||||
# directory) that is the favicon of the docs. Modern browsers use this as
|
||||
# the icon for tabs, windows and bookmarks. It should be a Windows-style
|
||||
# icon file (.ico).
|
||||
html_favicon = 'favicon.ico'
|
||||
|
||||
|
||||
# -- Options for HTMLHelp output ---------------------------------------------
|
||||
|
||||
|
|
|
|||
BIN
docs/source/favicon.ico
Normal file
BIN
docs/source/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 47 KiB |
|
|
@ -61,6 +61,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||
quickstart
|
||||
glossary
|
||||
pretrained_models
|
||||
usage
|
||||
model_sharing
|
||||
examples
|
||||
notebooks
|
||||
|
|
|
|||
597
docs/source/usage.rst
Normal file
597
docs/source/usage.rst
Normal file
|
|
@ -0,0 +1,597 @@
|
|||
Usage
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
This page shows the most frequent use-cases when using the library. The models available allow for many different
|
||||
configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
|
||||
for tasks such as question answering, sequence classification, named entity recognition and others.
|
||||
|
||||
These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
|
||||
automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
|
||||
for more information.
|
||||
Feel free to modify the code to be more specific and adapt it to your specific use-case.
|
||||
|
||||
In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
|
||||
checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
|
||||
following:
|
||||
|
||||
- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
|
||||
one of the `run_$TASK.py` script in the
|
||||
`examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
|
||||
- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
|
||||
and domain. As mentioned previously, you may leverage the
|
||||
`examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
|
||||
may create your own training script.
|
||||
|
||||
In order to do an inference on a task, several mechanisms are made available by the library:
|
||||
|
||||
- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
|
||||
- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
|
||||
but much more powerful.
|
||||
|
||||
Both approaches are showcased here.
|
||||
|
||||
.. note::
|
||||
|
||||
All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
|
||||
checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
|
||||
additional head that is used for the task, initializing the weights of that head randomly.
|
||||
|
||||
This would produce random output.
|
||||
|
||||
Sequence Classification
|
||||
--------------------------
|
||||
|
||||
Sequence classification is the task of classifying sequences according to a given number of classes. An example
|
||||
of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a GLUE sequence classification task, you may leverage the
|
||||
`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_glue.py>`_ or
|
||||
`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/run_tf_glue.py>`_ scripts.
|
||||
|
||||
Here is an example using the pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
|
||||
It leverages a fine-tuned model on sst2, which is a GLUE task.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("sentiment-analysis")
|
||||
|
||||
print(nlp("I hate you"))
|
||||
print(nlp("I love you"))
|
||||
|
||||
This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
|
||||
|
||||
::
|
||||
|
||||
[{'label': 'NEGATIVE', 'score': 0.9991129}]
|
||||
[{'label': 'POSITIVE', 'score': 0.99986565}]
|
||||
|
||||
|
||||
Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
|
||||
of each other. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
|
||||
with the weights stored in the checkpoint.
|
||||
- Build a sequence from the two sentences, with the correct model-specific separators token type ids
|
||||
and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
|
||||
- Pass this sequence through the model so that it is classified in one of the two available classes: 0
|
||||
(not a paraphrase) and 1 (is a paraphrase)
|
||||
- Compute the softmax of the result to get probabilities over the classes
|
||||
- Print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
|
||||
classes = ["not paraphrase", "is paraphrase"]
|
||||
|
||||
sequence_0 = "The company HuggingFace is based in New York City"
|
||||
sequence_1 = "Apples are especially bad for your health"
|
||||
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
|
||||
|
||||
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
|
||||
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
|
||||
|
||||
paraphrase_classification_logits = model(**paraphrase)[0]
|
||||
not_paraphrase_classification_logits = model(**not_paraphrase)[0]
|
||||
|
||||
paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
|
||||
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
|
||||
|
||||
print("Should be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
|
||||
|
||||
print("\nShould not be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
|
||||
## TENSORFLOW CODE
|
||||
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
|
||||
|
||||
classes = ["not paraphrase", "is paraphrase"]
|
||||
|
||||
sequence_0 = "The company HuggingFace is based in New York City"
|
||||
sequence_1 = "Apples are especially bad for your health"
|
||||
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
|
||||
|
||||
paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
|
||||
not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
|
||||
|
||||
paraphrase_classification_logits = model(paraphrase)[0]
|
||||
not_paraphrase_classification_logits = model(not_paraphrase)[0]
|
||||
|
||||
paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
|
||||
not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
|
||||
|
||||
print("Should be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
|
||||
|
||||
print("\nShould not be paraphrase")
|
||||
for i in range(len(classes)):
|
||||
print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
|
||||
|
||||
This outputs the following results:
|
||||
|
||||
::
|
||||
|
||||
Should be paraphrase
|
||||
not paraphrase: 10%
|
||||
is paraphrase: 90%
|
||||
|
||||
Should not be paraphrase
|
||||
not paraphrase: 94%
|
||||
is paraphrase: 6%
|
||||
|
||||
Extractive Question Answering
|
||||
----------------------------------------------------
|
||||
|
||||
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
|
||||
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a SQuAD task, you may leverage the `run_squad.py`.
|
||||
|
||||
Here is an example using the pipelines to do question answering: extracting an answer from a text given a question.
|
||||
It leverages a fine-tuned model on SQuAD.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("question-answering")
|
||||
|
||||
context = r"""
|
||||
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
|
||||
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
|
||||
a model on a SQuAD task, you may leverage the `run_squad.py`.
|
||||
"""
|
||||
|
||||
print(nlp(question="What is extractive question answering?", context=context))
|
||||
print(nlp(question="What is a good example of a question answering dataset?", context=context))
|
||||
|
||||
This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
|
||||
are the positions of the extracted answer in the text.
|
||||
|
||||
::
|
||||
|
||||
{'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
|
||||
{'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
|
||||
|
||||
|
||||
Here is an example of question answering using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
|
||||
with the weights stored in the checkpoint.
|
||||
- Define a text and a few questions.
|
||||
- Iterate over the questions and build a sequence from the text and the current question, with the correct
|
||||
model-specific separators token type ids and attention masks
|
||||
- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
|
||||
text), for both the start and end positions.
|
||||
- Compute the softmax of the result to get probabilities over the tokens
|
||||
- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
|
||||
- Print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
|
||||
text = r"""
|
||||
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
|
||||
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
|
||||
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
|
||||
TensorFlow 2.0 and PyTorch.
|
||||
"""
|
||||
|
||||
questions = [
|
||||
"How many pretrained models are available in Transformers?",
|
||||
"What does Transformers provide?",
|
||||
"Transformers provides interoperability between which frameworks?",
|
||||
]
|
||||
|
||||
for question in questions:
|
||||
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
|
||||
input_ids = inputs["input_ids"].tolist()[0]
|
||||
|
||||
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
answer_start_scores, answer_end_scores = model(**inputs)
|
||||
|
||||
answer_start = torch.argmax(
|
||||
answer_start_scores
|
||||
) # Get the most likely beginning of answer with the argmax of the score
|
||||
answer_end = torch.argmax(answer_end_scores) + 1 # Get the most likely end of answer with the argmax of the score
|
||||
|
||||
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
|
||||
|
||||
print(f"Question: {question}")
|
||||
print(f"Answer: {answer}\n")
|
||||
## TENSORFLOW CODE
|
||||
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
|
||||
|
||||
text = r"""
|
||||
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
|
||||
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
|
||||
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
|
||||
TensorFlow 2.0 and PyTorch.
|
||||
"""
|
||||
|
||||
questions = [
|
||||
"How many pretrained models are available in Transformers?",
|
||||
"What does Transformers provide?",
|
||||
"Transformers provides interoperability between which frameworks?",
|
||||
]
|
||||
|
||||
for question in questions:
|
||||
inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
|
||||
input_ids = inputs["input_ids"].numpy()[0]
|
||||
|
||||
text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
answer_start_scores, answer_end_scores = model(inputs)
|
||||
|
||||
answer_start = tf.argmax(
|
||||
answer_start_scores, axis=1
|
||||
).numpy()[0] # Get the most likely beginning of answer with the argmax of the score
|
||||
answer_end = (
|
||||
tf.argmax(answer_end_scores, axis=1) + 1
|
||||
).numpy()[0] # Get the most likely end of answer with the argmax of the score
|
||||
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
|
||||
|
||||
print(f"Question: {question}")
|
||||
print(f"Answer: {answer}\n")
|
||||
|
||||
This outputs the questions followed by the predicted answers:
|
||||
|
||||
::
|
||||
|
||||
Question: How many pretrained models are available in Transformers?
|
||||
Answer: over 32 +
|
||||
|
||||
Question: What does Transformers provide?
|
||||
Answer: general - purpose architectures
|
||||
|
||||
Question: Transformers provides interoperability between which frameworks?
|
||||
Answer: tensorflow 2 . 0 and pytorch
|
||||
|
||||
|
||||
|
||||
Language Modeling
|
||||
----------------------------------------------------
|
||||
|
||||
Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
|
||||
based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
|
||||
causal language modeling.
|
||||
|
||||
Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
|
||||
domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
|
||||
or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
|
||||
|
||||
Masked Language Modeling
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
|
||||
fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
|
||||
right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
|
||||
for downstream tasks requiring bi-directional context such as SQuAD (question answering,
|
||||
see `Lewis, Lui, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
|
||||
|
||||
Here is an example of using pipelines to replace a mask from a sequence:
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("fill-mask")
|
||||
print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
|
||||
|
||||
This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
|
||||
vocabulary:
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
|
||||
{'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
|
||||
{'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
|
||||
{'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
|
||||
{'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
|
||||
]
|
||||
|
||||
Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
|
||||
loads it with the weights stored in the checkpoint.
|
||||
- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
|
||||
- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
|
||||
- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
|
||||
values are the scores attributed to each token. The model gives a higher score to tokens it deems probable in that
|
||||
context.
|
||||
- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
|
||||
- Replace the mask token by the tokens and print the results
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelWithLMHead, AutoTokenizer
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
|
||||
model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
|
||||
|
||||
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="pt")
|
||||
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
|
||||
|
||||
token_logits = model(input)[0]
|
||||
mask_token_logits = token_logits[0, mask_token_index, :]
|
||||
|
||||
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
|
||||
|
||||
for token in top_5_tokens:
|
||||
print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelWithLMHead, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
|
||||
model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
|
||||
|
||||
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="tf")
|
||||
mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
|
||||
|
||||
token_logits = model(input)[0]
|
||||
mask_token_logits = token_logits[0, mask_token_index, :]
|
||||
|
||||
top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
|
||||
|
||||
for token in top_5_tokens:
|
||||
print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
|
||||
|
||||
This prints five sequences, with the top 5 tokens predicted by the model:
|
||||
|
||||
::
|
||||
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
|
||||
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
|
||||
|
||||
|
||||
Causal Language Modeling
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
|
||||
model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
|
||||
for generation tasks.
|
||||
|
||||
There is currently no pipeline to do causal language modeling/generation.
|
||||
|
||||
Here is an example using the tokenizer and model, leveraging the :func:`~transformers.PreTrainedModel.generate` method
|
||||
to generate the tokens following the initial sequence in PyTorch, and creating a simple loop in TensorFlow.
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelWithLMHead, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
model = AutoModelWithLMHead.from_pretrained("gpt2")
|
||||
|
||||
sequence = f"Hugging Face is based in DUMBO, New York City, and is"
|
||||
|
||||
input = tokenizer.encode(sequence, return_tensors="pt")
|
||||
generated = model.generate(input, max_length=50)
|
||||
|
||||
resulting_string = tokenizer.decode(generated.tolist()[0])
|
||||
print(resulting_string)
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelWithLMHead, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
||||
model = TFAutoModelWithLMHead.from_pretrained("gpt2")
|
||||
|
||||
sequence = f"Hugging Face is based in DUMBO, New York City, and is"
|
||||
generated = tokenizer.encode(sequence)
|
||||
|
||||
for i in range(50):
|
||||
predictions = model(tf.constant([generated]))[0]
|
||||
token = tf.argmax(predictions[0], axis=1)[-1].numpy()
|
||||
generated += [token]
|
||||
|
||||
resulting_string = tokenizer.decode(generated)
|
||||
print(resulting_string)
|
||||
|
||||
|
||||
This outputs a (hopefully) coherent string from the original sequence, as the
|
||||
:func:`~transformers.PreTrainedModel.generate` samples from a top_p/top_k distribution:
|
||||
|
||||
::
|
||||
|
||||
Hugging Face is based in DUMBO, New York City, and is a live-action TV series based on the novel by John
|
||||
Carpenter, and its producers, David Kustlin and Steve Pichar. The film is directed by!
|
||||
|
||||
|
||||
Named Entity Recognition
|
||||
----------------------------------------------------
|
||||
|
||||
Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
|
||||
token as a person, an organisation or a location.
|
||||
An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
|
||||
If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
|
||||
`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
|
||||
|
||||
Here is an example using the pipelines to do named entity recognition, trying to identify tokens as belonging to one
|
||||
of 9 classes:
|
||||
|
||||
- O, Outside of a named entity
|
||||
- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
- I-MISC, Miscellaneous entity
|
||||
- B-PER, Beginning of a person's name right after another person's name
|
||||
- I-PER, Person's name
|
||||
- B-ORG, Beginning of an organisation right after another organisation
|
||||
- I-ORG, Organisation
|
||||
- B-LOC, Beginning of a location right after another location
|
||||
- I-LOC, Location
|
||||
|
||||
It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
|
||||
`dbmdz <https://github.com/dbmdz>`__.
|
||||
|
||||
::
|
||||
|
||||
from transformers import pipeline
|
||||
|
||||
nlp = pipeline("ner")
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge which is visible from the window."
|
||||
|
||||
print(nlp(sequence))
|
||||
|
||||
This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here is the
|
||||
expected results:
|
||||
|
||||
::
|
||||
|
||||
[
|
||||
{'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
|
||||
{'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
|
||||
{'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
|
||||
{'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
|
||||
{'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
|
||||
{'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
|
||||
{'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
|
||||
{'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
|
||||
{'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
|
||||
{'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
|
||||
{'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
|
||||
{'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
|
||||
]
|
||||
|
||||
Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
|
||||
"Manhattan Bridge" have been identified as locations.
|
||||
|
||||
Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
|
||||
|
||||
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
|
||||
  is loaded with the weights stored in the checkpoint.
|
||||
- Define the label list with which the model was trained.
|
||||
- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
|
||||
- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
|
||||
encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
|
||||
- Encode that sequence into IDs (special tokens are added automatically).
|
||||
- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
|
||||
distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
|
||||
for each token.
|
||||
- Zip together each token with its prediction and print it.
|
||||
|
||||
::
|
||||
|
||||
## PYTORCH CODE
|
||||
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
||||
import torch
|
||||
|
||||
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
label_list = [
|
||||
"O", # Outside of a named entity
|
||||
"B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
"I-MISC", # Miscellaneous entity
|
||||
"B-PER", # Beginning of a person's name right after another person's name
|
||||
"I-PER", # Person's name
|
||||
"B-ORG", # Beginning of an organisation right after another organisation
|
||||
"I-ORG", # Organisation
|
||||
"B-LOC", # Beginning of a location right after another location
|
||||
"I-LOC" # Location
|
||||
]
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge."
|
||||
|
||||
# Bit of a hack to get the tokens with the special tokens
|
||||
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
|
||||
inputs = tokenizer.encode(sequence, return_tensors="pt")
|
||||
|
||||
outputs = model(inputs)[0]
|
||||
predictions = torch.argmax(outputs, dim=2)
|
||||
|
||||
print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
|
||||
## TENSORFLOW CODE
|
||||
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
|
||||
import tensorflow as tf
|
||||
|
||||
model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
|
||||
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
||||
|
||||
label_list = [
|
||||
"O", # Outside of a named entity
|
||||
"B-MISC", # Beginning of a miscellaneous entity right after another miscellaneous entity
|
||||
"I-MISC", # Miscellaneous entity
|
||||
"B-PER", # Beginning of a person's name right after another person's name
|
||||
"I-PER", # Person's name
|
||||
"B-ORG", # Beginning of an organisation right after another organisation
|
||||
"I-ORG", # Organisation
|
||||
"B-LOC", # Beginning of a location right after another location
|
||||
"I-LOC" # Location
|
||||
]
|
||||
|
||||
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
|
||||
"close to the Manhattan Bridge."
|
||||
|
||||
# Bit of a hack to get the tokens with the special tokens
|
||||
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
|
||||
inputs = tokenizer.encode(sequence, return_tensors="tf")
|
||||
|
||||
outputs = model(inputs)[0]
|
||||
predictions = tf.argmax(outputs, axis=2)
|
||||
|
||||
print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
|
||||
|
||||
This outputs a list of each token mapped to its prediction. Differently from the pipeline, here every token has
|
||||
a prediction as we didn't remove the "O" class which means that no particular entity was found on that token. The
|
||||
following array should be the output:
|
||||
|
||||
::
|
||||
|
||||
[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
|
||||
Loading…
Reference in a new issue