From 7fad617dc1fc681a7f5da5e0172c8b83f4bf0024 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 10 Jul 2020 17:31:02 -0400
Subject: [PATCH] Document model outputs (#5673)

* Document model outputs

* Update docs/source/main_classes/output.rst

Co-authored-by: Lysandre Debut
---
 docs/source/index.rst                   |   1 +
 docs/source/main_classes/output.rst     | 141 ++++++++++++++++++++++++
 docs/source/model_doc/albert.rst        |   7 ++
 docs/source/model_doc/bert.rst          |   7 ++
 docs/source/model_doc/dpr.rst           |  13 +++
 docs/source/model_doc/electra.rst       |   7 ++
 docs/source/model_doc/gpt.rst           |   7 ++
 docs/source/model_doc/gpt2.rst          |   7 ++
 docs/source/model_doc/mobilebert.rst    |   7 ++
 docs/source/model_doc/transformerxl.rst |  10 ++
 docs/source/model_doc/xlm.rst           |   8 ++
 docs/source/model_doc/xlnet.rst         |  25 +++++
 src/transformers/file_utils.py          |  19 +++-
 src/transformers/modeling_albert.py     |   4 +-
 src/transformers/modeling_bert.py       |   4 +-
 src/transformers/modeling_electra.py    |   4 +-
 src/transformers/modeling_mobilebert.py |   4 +-
 src/transformers/modeling_outputs.py    |   9 +-
 18 files changed, 267 insertions(+), 17 deletions(-)
 create mode 100644 docs/source/main_classes/output.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index a84ccd0a4..bcc46a01d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -173,6 +173,7 @@ conversion utilities for the following models:
    :caption: Package Reference
 
    main_classes/configuration
+   main_classes/output
    main_classes/model
    main_classes/tokenizer
    main_classes/pipelines
diff --git a/docs/source/main_classes/output.rst b/docs/source/main_classes/output.rst
new file mode 100644
index 000000000..fe43c8e59
--- /dev/null
+++ b/docs/source/main_classes/output.rst
@@ -0,0 +1,141 @@
+Model outputs
+-------------
+
+PyTorch models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those
+are data structures containing all the information returned by the model, but they can also be used as tuples or
+dictionaries.
+
+Let's see how this looks in an example:
+
+.. code-block::
+
+    from transformers import BertTokenizer, BertForSequenceClassification
+    import torch
+
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+    outputs = model(**inputs, labels=labels)
+
+The ``outputs`` object is a :class:`~transformers.modeling_outputs.SequenceClassifierOutput`. As we can see in the
+documentation of that class below, it has an optional ``loss``, a ``logits``, an optional ``hidden_states`` and an
+optional ``attentions`` attribute. Here we have the ``loss`` since we passed along ``labels``, but we don't have
+``hidden_states`` and ``attentions`` because we didn't pass ``output_hidden_states=True`` or
+``output_attentions=True``.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get ``None``. Here, for instance, ``outputs.loss`` is the loss computed by the model, and ``outputs.attentions``
+is ``None``.
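+
+For instance (a minimal sketch continuing the example above), we can check which attributes were actually returned:
+
+.. code-block::
+
+    print(outputs.loss)                # a scalar tensor, since we passed labels
+    print(outputs.attentions is None)  # True, since we didn't pass output_attentions=True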
+
+When considering our ``outputs`` object as a tuple, it only considers the attributes that don't have ``None`` values.
+Here, for instance, it has two elements, ``loss`` then ``logits``, so
+
+.. code-block::
+
+    outputs[:2]
+
+will return the tuple ``(outputs.loss, outputs.logits)``.
+
+When considering our ``outputs`` object as a dictionary, it only considers the attributes that don't have ``None``
+values. Here, for instance, it has two keys that are ``loss`` and ``logits``.
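+
+Both views are also available explicitly through the ``to_tuple()`` and ``to_dict()`` methods of
+:class:`~transformers.file_utils.ModelOutput` (a minimal sketch reusing the ``outputs`` object above):
+
+.. code-block::
+
+    outputs.to_tuple()          # (loss, logits) -- only the non-None attributes
+    outputs.to_dict()           # {'loss': ..., 'logits': ...}
+    outputs["logits"]           # string indexing works like a dictionary lookup
+    outputs[0] is outputs.loss  # True: integer indexing works like a tuple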
+
+We document here the generic model outputs that are used by more than one model type. Specific output types are
+documented on their corresponding model page.
+
+``ModelOutput``
+~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.file_utils.ModelOutput
+    :members:
+
+``BaseModelOutput``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutput
+    :members:
+
+``BaseModelOutputWithPooling``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPooling
+    :members:
+
+``BaseModelOutputWithPast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
+    :members:
+
+``Seq2SeqModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqModelOutput
+    :members:
+
+``CausalLMOutput``
+~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutput
+    :members:
+
+``CausalLMOutputWithPast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPast
+    :members:
+
+``MaskedLMOutput``
+~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.MaskedLMOutput
+    :members:
+
+``Seq2SeqLMOutput``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqLMOutput
+    :members:
+
+``NextSentencePredictorOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.NextSentencePredictorOutput
+    :members:
+
+``SequenceClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.SequenceClassifierOutput
+    :members:
+
+``Seq2SeqSequenceClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput
+    :members:
+
+``MultipleChoiceModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.MultipleChoiceModelOutput
+    :members:
+
+``TokenClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.TokenClassifierOutput
+    :members:
+
+``QuestionAnsweringModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.QuestionAnsweringModelOutput
+    :members:
+
+``Seq2SeqQuestionAnsweringModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
+    :members:
diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst
index 8bd0c3e8f..ab382a27c 100644
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -47,6 +47,13 @@ AlbertTokenizer
       create_token_type_ids_from_sequences, save_vocabulary
 
 
+Albert specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_albert.AlbertForPretrainingOutput
+    :members:
+
+
 AlbertModel
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index b77a241a8..cbc1c8aa7 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -59,6 +59,13 @@ BertTokenizerFast
     :members:
 
 
+Bert specific outputs
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_bert.BertForPretrainingOutput
+    :members:
+
+
 BertModel
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/dpr.rst b/docs/source/model_doc/dpr.rst
index 84b0527c2..a77d3868b 100644
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -69,6 +69,19 @@ DPRReaderTokenizerFast
     :members:
 
 
+DPR specific outputs
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_dpr.DPRContextEncoderOutput
+    :members:
+
+.. autoclass:: transformers.modeling_dpr.DPRQuestionEncoderOutput
+    :members:
+
+.. autoclass:: transformers.modeling_dpr.DPRReaderOutput
+    :members:
+
+
 DPRContextEncoder
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/electra.rst b/docs/source/model_doc/electra.rst
index 431b4f271..895ca9dde 100644
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -71,6 +71,13 @@ ElectraTokenizerFast
     :members:
 
 
+Electra specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_electra.ElectraForPretrainingOutput
+    :members:
+
+
 ElectraModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/gpt.rst b/docs/source/model_doc/gpt.rst
index 4c54dee70..39c5fe269 100644
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -71,6 +71,13 @@ OpenAIGPTTokenizerFast
     :members:
 
 
+OpenAI specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
+    :members:
+
+
 OpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/gpt2.rst b/docs/source/model_doc/gpt2.rst
index 45ac90ec2..3f1be1bb4 100644
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -58,6 +58,13 @@ GPT2TokenizerFast
     :members:
 
 
+GPT2 specific outputs
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput
+    :members:
+
+
 GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/mobilebert.rst b/docs/source/model_doc/mobilebert.rst
index caec4af1f..ad3e0c206 100644
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -56,6 +56,13 @@ MobileBertTokenizerFast
     :members:
 
 
+MobileBert specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_mobilebert.MobileBertForPretrainingOutput
+    :members:
+
+
 MobileBertModel
 ~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index 336bfdcd6..dc1a63783 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -54,6 +54,16 @@ TransfoXLTokenizerFast
     :members:
 
 
+TransfoXL specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_transfo_xl.TransfoXLModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+    :members:
+
+
 TransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index b043a1bec..cd14a77cb 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -46,6 +46,14 @@ XLMTokenizer
       :members: build_inputs_with_special_tokens, get_special_tokens_mask,
       create_token_type_ids_from_sequences, save_vocabulary
 
+
+XLM specific outputs
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_xlm.XLMForQuestionAnsweringOutput
+    :members:
+
+
 XLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index 79faab8d5..bea589759 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -50,6 +50,31 @@ XLNetTokenizer
       create_token_type_ids_from_sequences, save_vocabulary
 
 
+XLNet specific outputs
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_xlnet.XLNetModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetLMHeadModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForSequenceClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForMultipleChoiceOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForTokenClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput
+    :members:
+
+
 XLNetModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index e4bcec6c8..33688b767 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -189,7 +189,7 @@ def add_end_docstrings(*docstr):
 
 RETURN_INTRODUCTION = r"""
     Returns:
-        :class:`~transformers.{output_type}` or :obj:`tuple(torch.FloatTensor)` (if ``return_tuple=True`` is passed or when ``config.return_tuple=True``) comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs:
+        :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)` (if ``return_tuple=True`` is passed or when ``config.return_tuple=True``) comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs:
 
 """
 
@@ -208,7 +208,8 @@ def _prepare_output_docstrings(output_type, config_class):
         docstrings = "\n".join(lines[(i + 1) :])
 
     # Add the return introduction
-    intro = RETURN_INTRODUCTION.format(output_type=output_type.__name__, config_class=config_class)
+    full_output_type = f"{output_type.__module__}.{output_type.__name__}"
+    intro = RETURN_INTRODUCTION.format(full_output_type=full_output_type, config_class=config_class)
     return intro + docstrings
 
 
@@ -857,14 +858,24 @@ def tf_required(func):
 
 class ModelOutput:
     """
-    Base class for all model outputs as dataclass. Has a ``__getitem__`` (to make it behave like a ``namedtuple``) that
-    will ignore ``None`` in the attributes.
+    Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
+    a tuple) or by string (like a dictionary), ignoring the ``None`` attributes.
     """
 
     def to_tuple(self):
+        """
+        Converts :obj:`self` to a tuple.
+
+        Returns:
+            A tuple containing all the non-:obj:`None` attributes of :obj:`self`.
+        """
         return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None)
 
     def to_dict(self):
+        """
+        Converts :obj:`self` to a Python dictionary.
+
+        Returns:
+            A dictionary containing all the non-:obj:`None` attributes of :obj:`self`.
+        """
         return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None}
 
     def __getitem__(self, i):
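To make the behavior documented in these two methods concrete, here is a minimal, self-contained sketch (``ToyOutput`` is a hypothetical class defined only for illustration, not part of the library):

.. code-block::

    from dataclasses import dataclass
    from typing import Optional

    import torch

    from transformers.file_utils import ModelOutput

    @dataclass
    class ToyOutput(ModelOutput):
        # Hypothetical output type, for illustration only.
        loss: Optional[torch.FloatTensor] = None
        logits: torch.FloatTensor = None

    out = ToyOutput(logits=torch.ones(1, 2))
    out.to_tuple()  # (tensor([[1., 1.]]),) -- loss is None, so it is skipped
    out.to_dict()   # {'logits': tensor([[1., 1.]])}
    out[0]          # integer indexing goes through to_tuple()
    out["logits"]   # string indexing goes through to_dict()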
+ """ return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None} def __getitem__(self, i): diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 9f7f4fb96..c1f1f73c7 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -410,9 +410,9 @@ class AlbertForPretrainingOutput(ModelOutput): Output type of :class:`~transformers.AlbertForPretrainingModel`. Args: - loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 34aa5d167..d2f6c3710 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -585,9 +585,9 @@ class BertForPretrainingOutput(ModelOutput): Output type of :class:`~transformers.BertForPretrainingModel`. Args: - loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index 71d02d7d9..267dbea7d 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -191,9 +191,9 @@ class ElectraForPretrainingOutput(ModelOutput): Output type of :class:`~transformers.ElectraForPretrainingModel`. Args: - loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): Total loss of the ELECTRA objective. - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): Prediction scores of the head (scores for each token before SoftMax). 
         hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py
index cb52395ce..b62035f54 100644
--- a/src/transformers/modeling_mobilebert.py
+++ b/src/transformers/modeling_mobilebert.py
@@ -685,9 +685,9 @@ class MobileBertForPretrainingOutput(ModelOutput):
     Output type of :class:`~transformers.MobileBertForPretrainingModel`.
 
     Args:
-        loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
             Total loss as the sum of the masked language modeling loss and the next sequence prediction
             (classification) loss.
-        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
+        prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
             Prediction scores of the next sequence prediction (classification) head (scores of True/False
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 0ef88f2cd..f9cf15c40 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -40,12 +40,11 @@ class BaseModelOutputWithPooling(ModelOutput):
     Args:
         last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the model.
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
-        pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
+        pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`):
             Last layer hidden-state of the first token of the sequence (classification token)
             further processed by a Linear layer and a Tanh activation function. The Linear
             layer weights are trained from the next sentence prediction (classification)
-            objective during pre-training.
+            objective during pretraining.
 
             This output is usually *not* a good summary
             of the semantic content of the input, you're often better with averaging or pooling
@@ -114,7 +113,7 @@ class Seq2SeqModelOutput(ModelOutput):
     last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
         Sequence of hidden-states at the output of the last layer of the decoder of the model.
 
-        If `decoder_past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
+        If ``decoder_past_key_values`` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, 1, hidden_size)` is output.
     decoder_past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
         List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`).
@@ -337,7 +336,7 @@ class SequenceClassifierOutput(ModelOutput):
     Base class for outputs of sentence classification models.
 
     Args:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
             Classification (or regression if config.num_labels==1) loss.
         logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
             Classification (or regression if config.num_labels==1) scores (before SoftMax).
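As a closing illustration of the ``labels`` behavior this last hunk documents, here is a minimal sketch reusing the ``model``, ``inputs`` and ``labels`` objects from the introductory example:

.. code-block::

    # Without labels, no loss is computed, so outputs.loss is None.
    outputs = model(**inputs)
    assert outputs.loss is None

    # With labels, the classification loss becomes the first non-None attribute.
    outputs = model(**inputs, labels=labels)
    assert outputs.loss is not None
    assert outputs[0] is outputs.loss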