diff --git a/docs/source/model_doc/visual_bert.rst b/docs/source/model_doc/visual_bert.rst
index 179b2e4a4..7258b12ad 100644
--- a/docs/source/model_doc/visual_bert.rst
+++ b/docs/source/model_doc/visual_bert.rst
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  `__ : This notebook
+  contains an example of VisualBERT applied to VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  `__ : This notebook contains
+  an example of how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:

     >>> # this is a custom function that returns the visual embeddings given the image path
     >>> visual_embeds = get_visual_embeddings(image_path)

+    >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+    >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+    >>> inputs.update({
+    ...     "visual_embeds": visual_embeds,
+    ...     "visual_token_type_ids": visual_token_type_ids,
+    ...     "visual_attention_mask": visual_attention_mask
+    ... })
     >>> outputs = model(**inputs)
     >>> last_hidden_state = outputs.last_hidden_state

diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index f7aea7851..c4cdda9e0 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -743,14 +743,14 @@ class VisualBertModel(VisualBertPreTrainedModel):

             >>> inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> outputs = model(**inputs)

@@ -923,14 +923,14 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
             >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> max_length = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
             >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
             >>> sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
@@ -1068,13 +1068,13 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):

             >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
             >>> # batch size is 1
-            >>> inputs_dict = {{k: v.unsqueeze(0) for k,v in encoding.items()}}
-            >>> inputs_dict.update({{
-            ...     visual_embeds=visual_embeds,
-            ...     visual_attention_mask=visual_attention_mask,
-            ...     visual_token_type_ids=visual_token_type_ids,
-            ...     labels=labels
-            ... }})
+            >>> inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
+            >>> inputs_dict.update({
+            ...     "visual_embeds": visual_embeds,
+            ...     "visual_attention_mask": visual_attention_mask,
+            ...     "visual_token_type_ids": visual_token_type_ids,
+            ...     "labels": labels
+            ... })
             >>> outputs = model(**inputs_dict)

             >>> loss = outputs.loss
@@ -1204,14 +1204,14 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.tensor([[0.0,1.0]]).unsqueeze(0) # Batch size 1, Num labels 2

@@ -1326,14 +1326,14 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2

@@ -1486,16 +1486,16 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
             >>> text = "Who is eating the apple?"
             >>> inputs = tokenizer(text, return_tensors='pt')
             >>> visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) #example
+            >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
             >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
             >>> region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))

-            >>> inputs.update({{
+            >>> inputs.update({
             ...     "region_to_phrase_position": region_to_phrase_position,
             ...     "visual_embeds": visual_embeds,
             ...     "visual_token_type_ids": visual_token_type_ids,
             ...     "visual_attention_mask": visual_attention_mask
-            ... }})
+            ... })

             >>> labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
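
Every snippet touched by this patch assumes a user-supplied ``get_visual_embeddings`` helper. The sketch below is a minimal, self-contained stand-in that makes the :class:`~transformers.VisualBertModel` example runnable end to end: it returns random region features instead of real detector output. The helper name and signature, the box count (36), the dummy image path, and the 2048-dimensional feature size (intended to match the ``visual_embedding_dim`` of the ``uclanlp/visualbert-vqa-coco-pre`` checkpoint; verify against the config) are illustrative assumptions, not part of this patch. A real pipeline would return pooled Faster R-CNN region features, as in the notebooks linked above.

.. code-block:: python

    import torch
    from transformers import BertTokenizer, VisualBertModel

    def get_visual_embeddings(image_path, num_boxes=36, feature_dim=2048):
        # Hypothetical placeholder: a real implementation would run a
        # Detectron-style Faster R-CNN on `image_path` and return pooled
        # region features of shape (num_boxes, feature_dim).
        return torch.randn(num_boxes, feature_dim)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")

    inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
    visual_embeds = get_visual_embeddings("image.jpg").unsqueeze(0)  # add batch dimension
    # One visual token per region; mask of ones attends to every region.
    visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
    visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    })

    outputs = model(**inputs)
    # last_hidden_state covers the text tokens followed by the visual tokens:
    # (1, text_length + num_boxes, hidden_size)
    print(outputs.last_hidden_state.shape)

Swapping in real detector features changes only ``get_visual_embeddings``; the tensor wiring is identical across all of the task heads patched above.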