diff --git a/docs/source/model_doc/visual_bert.rst b/docs/source/model_doc/visual_bert.rst
index 179b2e4a4..7258b12ad 100644
--- a/docs/source/model_doc/visual_bert.rst
+++ b/docs/source/model_doc/visual_bert.rst
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  `__ : This notebook
+  contains an example of VisualBERT applied to VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  `__ : This notebook contains
+  an example of how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:

     >>> # this is a custom function that returns the visual embeddings given the image path
     >>> visual_embeds = get_visual_embeddings(image_path)

+    >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+    >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+    >>> inputs.update({
+    ...     "visual_embeds": visual_embeds,
+    ...     "visual_token_type_ids": visual_token_type_ids,
+    ...     "visual_attention_mask": visual_attention_mask
+    ... })
     >>> outputs = model(**inputs)
     >>> last_hidden_state = outputs.last_hidden_state

diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index f7aea7851..c4cdda9e0 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):

             >>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> outputs = model(**inputs)

@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
             >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
             >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
             >>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):

             >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
             >>> # batch size is 1
-            >>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
-            >>> inputs_dict.update({{
-            ...     visual_embeds=visual_embeds,
-            ...     visual_attention_mask=visual_attention_mask,
-            ...     visual_token_type_ids=visual_token_type_ids,
-            ...     labels=labels
-            ... }})
+            >>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
+            >>> inputs_dict.update({
+            ...     "visual_embeds": visual_embeds,
+            ...     "visual_attention_mask": visual_attention_mask,
+            ...     "visual_token_type_ids": visual_token_type_ids,
+            ...     "labels": labels
+            ... })
             >>> outputs = model(**inputs_dict)

             >>> loss = outputs.loss
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2

@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2

@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
             >>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "region_to_phrase_position": region_to_phrase_position,
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
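
Every snippet touched by this patch assumes a user-supplied ``get_visual_embeddings`` helper. The sketch below is a minimal, self-contained stand-in that makes the :class:`~transformers.VisualBertModel` example runnable end to end: it returns random region features instead of real detector output. The helper name and signature, the box count (36), the dummy image path, and the 2048-dimensional feature size (intended to match the ``visual_embedding_dim`` of the ``uclanlp/visualbert-vqa-coco-pre`` checkpoint; verify against the config) are illustrative assumptions, not part of this patch. A real pipeline would return pooled Faster R-CNN region features, as in the notebooks linked above.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertModel

    def get_visual_embeddings(image_path, num_boxes=36, feature_dim=2048):
        # Hypothetical placeholder: a real implementation would run a
        # Detectron-style Faster R-CNN on `image_path` and return pooled
        # region features of shape (num_boxes, feature_dim).
        return torch.randn(num_boxes, feature_dim)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
    visual_embeds = get_visual_embeddings("image.jpg").unsqueeze(0)  # add batch dimension
    # One visual token per region; mask of ones attends to every region.
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    })

    outputs = model(**inputs)
    # last_hidden_state covers the text tokens followed by the visual tokens:
    # (1, text_length + num_boxes, hidden_size)
    print(outputs.last_hidden_state.shape)

Swapping in real detector features changes only ``get_visual_embeddings``; the tensor wiring is identical across all of the task heads patched above.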