Fix VisualBERT docs (#13106)
* Fix VisualBERT docs

* Show example notebooks as lists

* Fix style
This commit is contained in:
parent e46ad22cd6
commit bda1cb0236

2 changed files with 40 additions and 25 deletions
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
 >>> # this is a custom function that returns the visual embeddings given the image path
 >>> visual_embeds = get_visual_embeddings(image_path)

+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+>>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+>>> inputs.update({
+... "visual_embeds": visual_embeds,
+... "visual_token_type_ids": visual_token_type_ids,
+... "visual_attention_mask": visual_attention_mask
+... })
 >>> outputs = model(**inputs)
 >>> last_hidden_state = outputs.last_hidden_state

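For reference, here is a minimal, self-contained sketch of the flow the example above walks through. The checkpoint name and the stubbed `get_visual_embeddings` (random 2048-dimensional region features) are illustrative assumptions; in practice the visual features come from a Detectron-like detector as shown in the linked notebooks.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    def get_visual_embeddings(image_path):
        # Placeholder for a detector/feature extractor that returns region
        # features of shape (batch_size, num_regions, visual_embedding_dim).
        return torch.randn(1, 36, 2048)

    inputs = tokenizer("What is the man eating?", return_tensors="pt")
    visual_embeds = get_visual_embeddings("image.jpg")
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    })

    outputs = model(**inputs)
    # One hidden state per text token and per visual region, concatenated.
    last_hidden_state = outputs.last_hidden_state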
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):

 >>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> outputs = model(**inputs)

@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):

 >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })
 >>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
 >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
 >>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
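To make the `max_length` arithmetic in the pre-training example concrete: the masked-language-modelling head scores every position of the concatenated text-plus-visual sequence, so the label tensor has to be padded out to `text_length + num_regions`. The sketch below fills in the surrounding setup with assumed values (checkpoint name, 36 random region features); only the shape logic is the point.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForPreTraining

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer(f"The capital of France is {tokenizer.mask_token}.", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    # e.g. 9 text tokens + 36 visual regions -> labels padded to length 45
    max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
    labels = tokenizer(
        "The capital of France is Paris.",
        return_tensors="pt", padding="max_length", max_length=max_length,
    )["input_ids"]
    sentence_image_labels = torch.tensor(1).unsqueeze(0)  # sentence-image prediction target, batch size 1

    outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
    loss = outputs.loss
    prediction_logits = outputs.prediction_logits  # (1, 45, vocab_size)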
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):

 >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
 >>> # batch size is 1
->>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
->>> inputs_dict.update({{
-... visual_embeds=visual_embeds,
-... visual_attention_mask=visual_attention_mask,
-... visual_token_type_ids=visual_token_type_ids,
-... labels=labels
-... }})
+>>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
+>>> inputs_dict.update({
+... "visual_embeds": visual_embeds,
+... "visual_attention_mask": visual_attention_mask,
+... "visual_token_type_ids": visual_token_type_ids,
+... "labels": labels
+... })
 >>> outputs = model(**inputs_dict)

 >>> loss = outputs.loss
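The hunk above only shows the tail of the multiple-choice example, so here is a hedged sketch of how the surrounding variables can be prepared. The checkpoint, the dummy region features, and the per-choice expansion of the visual tensors are assumptions chosen to match the `(batch, num_choices, ...)` layout that the unsqueezed text encoding implies.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForMultipleChoice

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")

    prompt = "What is the man doing?"
    choice0 = "He is eating an apple."
    choice1 = "He is riding a horse."

    visual_embeds = torch.randn(36, 512)  # stand-in: (num_regions, visual_embedding_dim)
    visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape).contiguous()  # (1, 2, 36, 512): one copy per choice
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    labels = torch.tensor(0).unsqueeze(0)  # choice0 is the correct answer

    encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
    inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}  # add the batch dimension
    inputs_dict.update({
        "visual_embeds": visual_embeds,
        "visual_attention_mask": visual_attention_mask,
        "visual_token_type_ids": visual_token_type_ids,
        "labels": labels,
    })

    outputs = model(**inputs_dict)
    loss, logits = outputs.loss, outputs.logits  # logits: (1, 2)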
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2

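Beyond computing a loss, the question-answering head is typically read out by taking the argmax over the answer vocabulary. A short sketch follows, with an assumed checkpoint and fake visual features; the mapping from answer index to answer string lives in the VQA demo notebook's processing code, not in the model itself.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForQuestionAnswering

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

    inputs = tokenizer("Who is eating the apple?", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    outputs = model(**inputs)
    logits = outputs.logits  # (1, num_labels): one score per candidate answer
    predicted_answer_idx = logits.argmax(-1).item()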
@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

->>> inputs.update({{
+>>> inputs.update({
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2

@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
 >>> text = "Who is eating the apple?"
 >>> inputs = tokenizer(text, return_tensors='pt')
 >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
->>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+>>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 >>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))

->>> inputs.update({{
+>>> inputs.update({
 ... "region_to_phrase_position": region_to_phrase_position,
 ... "visual_embeds": visual_embeds,
 ... "visual_token_type_ids": visual_token_type_ids,
 ... "visual_attention_mask": visual_attention_mask
-... }})
+... })

 >>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1

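To unpack the shapes in the region-to-phrase-alignment example: text tokens and visual regions are concatenated into one sequence of length `text_length + num_regions`, `region_to_phrase_position` supplies one position index per entry of that sequence, and the labels score every position against every region. The sketch below makes those shapes concrete; the checkpoint name, the integer dtype for the position indices, and the dummy targets are assumptions for illustration.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer("Who is eating the apple?", return_tensors="pt")
    visual_embeds = torch.randn(1, 36, 2048)  # stand-in for detector features
    text_len = inputs["input_ids"].shape[-1]
    num_regions = visual_embeds.shape[-2]
    total_len = text_len + num_regions  # text and visual positions are concatenated

    inputs.update({
        # one (integer) position index per entry of the concatenated sequence
        "region_to_phrase_position": torch.ones((1, total_len), dtype=torch.long),
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    # one alignment score per (position, region) pair
    labels = torch.ones((1, total_len, num_regions))
    outputs = model(**inputs, labels=labels)
    loss = outputs.loss
    logits = outputs.logits  # (1, total_len, num_regions)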