Fix VisualBERT docs (#13106)
* Fix VisualBERT docs

* Show example notebooks as lists

* Fix style
This commit is contained in:
parent e46ad22cd6
commit bda1cb0236

2 changed files with 40 additions and 25 deletions
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
 >>> # this is a custom function that returns the visual embeddings given the image path
 >>> visual_embeds = get_visual_embeddings(image_path)

+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+>>> inputs.update({
+... "visual_embeds": visual_embeds,
+... "visual_token_type_ids": visual_token_type_ids,
+... "visual_attention_mask": visual_attention_mask
+... })
 >>> outputs = model(**inputs)
 >>> last_hidden_state = outputs.last_hidden_state

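For reference, here is a minimal, self-contained sketch of the flow the example above walks through. The checkpoint name and the stubbed `get_visual_embeddings` (random 2048-dimensional region features) are illustrative assumptions; in practice the visual features come from a Detectron-like detector as shown in the linked notebooks.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    def get_visual_embeddings(image_path):
        # Placeholder for a detector/feature extractor that returns region
        # features of shape (batch_size, num_regions, visual_embedding_dim).
        return torch.randn(1, 36, 2048)

    inputs = tokenizer("What is the man eating?", return_tensors="pt")
    visual_embeds = get_visual_embeddings("image.jpg")
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    })

    outputs = model(**inputs)
    # One hidden state per text token and per visual region, concatenated.
    last_hidden_state = outputs.last_hidden_state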
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):

 >>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> outputs = model(**inputs)

@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):

 >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })
 >>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
 >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
 >>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
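To make the `max_length` arithmetic in the pre-training example concrete: the masked-language-modelling head scores every position of the concatenated text-plus-visual sequence, so the label tensor has to be padded out to `text_length + num_regions`. The sketch below fills in the surrounding setup with assumed values (checkpoint name, 36 random region features); only the shape logic is the point.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForPreTraining

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer(f"The capital of France is {tokenizer.mask_token}.", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    # e.g. 9 text tokens + 36 visual regions -> labels padded to length 45
    max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
    labels = tokenizer(
        "The capital of France is Paris.",
        return_tensors="pt", padding="max_length", max_length=max_length,
    )["input_ids"]
    sentence_image_labels = torch.tensor(1).unsqueeze(0)  # sentence-image prediction target, batch size 1

    outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
    loss = outputs.loss
    prediction_logits = outputs.prediction_logits  # (1, 45, vocab_size)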
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):

 >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
 >>> # batch size is 1
->>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
->>> inputs_dict.update({{
-... visual_embeds=visual_embeds,
-... visual_attention_mask=visual_attention_mask,
-... visual_token_type_ids=visual_token_type_ids,
-... labels=labels
-... }})
+>>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
+>>> inputs_dict.update({
+... "visual_embeds": visual_embeds,
+... "visual_attention_mask": visual_attention_mask,
+... "visual_token_type_ids": visual_token_type_ids,
+... "labels": labels
+... })
 >>> outputs = model(**inputs_dict)

 >>> loss = outputs.loss
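The hunk above only shows the tail of the multiple-choice example, so here is a hedged sketch of how the surrounding variables can be prepared. The checkpoint, the dummy region features, and the per-choice expansion of the visual tensors are assumptions chosen to match the `(batch, num_choices, ...)` layout that the unsqueezed text encoding implies.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForMultipleChoice

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")

    prompt = "What is the man doing?"
    choice0 = "He is eating an apple."
    choice1 = "He is riding a horse."

    visual_embeds = torch.randn(36, 512)  # stand-in: (num_regions, visual_embedding_dim)
    visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape).contiguous()  # (1, 2, 36, 512): one copy per choice
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    labels = torch.tensor(0).unsqueeze(0)  # choice0 is the correct answer

    encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
    inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}  # add the batch dimension
    inputs_dict.update({
        "visual_embeds": visual_embeds,
        "visual_attention_mask": visual_attention_mask,
        "visual_token_type_ids": visual_token_type_ids,
        "labels": labels,
    })

    outputs = model(**inputs_dict)
    loss, logits = outputs.loss, outputs.logits  # logits: (1, 2)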
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2

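Beyond computing a loss, the question-answering head is typically read out by taking the argmax over the answer vocabulary. A short sketch follows, with an assumed checkpoint and fake visual features; the mapping from answer index to answer string lives in the VQA demo notebook's processing code, not in the model itself.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForQuestionAnswering

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

    inputs = tokenizer("Who is eating the apple?", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    outputs = model(**inputs)
    logits = outputs.logits  # (1, num_labels): one score per candidate answer
    predicted_answer_idx = logits.argmax(-1).item()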
@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2

@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 >>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))

->>> inputs.update({{
+>>> inputs.update({
 ... "region_to_phrase_position": region_to_phrase_position,
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1

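To unpack the shapes in the region-to-phrase-alignment example: text tokens and visual regions are concatenated into one sequence of length `text_length + num_regions`, `region_to_phrase_position` supplies one position index per entry of that sequence, and the labels score every position against every region. The sketch below makes those shapes concrete; the checkpoint name, the integer dtype for the position indices, and the dummy targets are assumptions for illustration.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer("Who is eating the apple?", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    text_len = inputs["input_ids"].shape[-1]
    num_regions = visual_embeds.shape[-2]
    total_len = text_len + num_regions  # text and visual positions are concatenated

    inputs.update({
        # one (integer) position index per entry of the concatenated sequence
        "region_to_phrase_position": torch.ones((1, total_len), dtype=torch.long),
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    # one alignment score per (position, region) pair
    labels = torch.ones((1, total_len, num_regions))
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    logits = outputs.logits  # (1, total_len, num_regions)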