diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx
index e6a3d912b..98a458e11 100644
--- a/docs/source/en/index.mdx
+++ b/docs/source/en/index.mdx
@@ -248,7 +248,7 @@ Flax), PyTorch, and/or TensorFlow.
| GPT NeoX | ❌ | ✅ | ✅ | ❌ | ❌ |
| GPT NeoX Japanese | ✅ | ❌ | ✅ | ❌ | ❌ |
| GPT-J | ❌ | ❌ | ✅ | ✅ | ✅ |
-| GroupViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+| GroupViT | ❌ | ❌ | ✅ | ✅ | ❌ |
| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/groupvit.mdx b/docs/source/en/model_doc/groupvit.mdx
index bad55ea28..8c955a2e3 100644
--- a/docs/source/en/model_doc/groupvit.mdx
+++ b/docs/source/en/model_doc/groupvit.mdx
@@ -26,7 +26,7 @@ Tips:
- You may specify `output_segmentation=True` in the forward of `GroupViTModel` to get the segmentation logits of input texts.
- The quickest way to get started with GroupViT is by checking the [example notebooks](https://github.com/xvjiarui/GroupViT/blob/main/demo/GroupViT_hf_inference_notebook.ipynb) (which showcase zero-shot segmentation inference). One can also check out the [HuggingFace Spaces demo](https://huggingface.co/spaces/xvjiarui/GroupViT) to play with GroupViT.
-This model was contributed by [xvjiarui](https://huggingface.co/xvjiarui).
+This model was contributed by [xvjiarui](https://huggingface.co/xvjiarui). The TensorFlow version was contributed by [ariG23498](https://huggingface.co/ariG23498) with the help of [Yih-Dar SHIEH](https://huggingface.co/ydshieh), [Amy Roberts](https://huggingface.co/amyeroberts), and [Joao Gante](https://huggingface.co/joaogante).
The original code can be found [here](https://github.com/NVlabs/GroupViT).
@@ -59,3 +59,20 @@ The original code can be found [here](https://github.com/NVlabs/GroupViT).
[[autodoc]] GroupViTVisionModel
- forward
+
+## TFGroupViTModel
+
+[[autodoc]] TFGroupViTModel
+ - call
+ - get_text_features
+ - get_image_features
+
+## TFGroupViTTextModel
+
+[[autodoc]] TFGroupViTTextModel
+ - call
+
+## TFGroupViTVisionModel
+
+[[autodoc]] TFGroupViTVisionModel
+ - call
\ No newline at end of file
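For reviewers: a minimal end-to-end sketch of the new TF classes documented above, mirroring the tips for the PyTorch model. This is not part of the patch; it assumes the `nvidia/groupvit-gcc-yfcc` checkpoint can be loaded into the TF model (e.g. via `from_pt=True` if no TF weights are hosted).

```python
from PIL import Image
import requests
import tensorflow as tf
from transformers import AutoProcessor, TFGroupViTModel

processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
# `from_pt=True` is only needed if the checkpoint does not ship TF weights.
model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc", from_pt=True)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
)
# As with the PyTorch model, `output_segmentation=True` also returns per-pixel logits.
outputs = model(**inputs, output_segmentation=True)
probs = tf.nn.softmax(outputs.logits_per_image, axis=1)  # image-text similarity
seg_logits = outputs.segmentation_logits  # (batch, num_texts, height, width)
```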
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 50fb4d2c0..fb09d0af9 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2417,6 +2417,15 @@ else:
"TFGPTJPreTrainedModel",
]
)
+ _import_structure["models.groupvit"].extend(
+ [
+ "TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFGroupViTModel",
+ "TFGroupViTPreTrainedModel",
+ "TFGroupViTTextModel",
+ "TFGroupViTVisionModel",
+ ]
+ )
_import_structure["models.hubert"].extend(
[
"TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -4986,6 +4995,13 @@ if TYPE_CHECKING:
TFGPTJModel,
TFGPTJPreTrainedModel,
)
+ from .models.groupvit import (
+ TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFGroupViTModel,
+ TFGroupViTPreTrainedModel,
+ TFGroupViTTextModel,
+ TFGroupViTVisionModel,
+ )
from .models.hubert import (
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
TFHubertForCTC,
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index 77ab03d38..e13a0754b 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -50,6 +50,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
("gpt2", "TFGPT2Model"),
("gptj", "TFGPTJModel"),
+ ("groupvit", "TFGroupViTModel"),
("hubert", "TFHubertModel"),
("layoutlm", "TFLayoutLMModel"),
("layoutlmv3", "TFLayoutLMv3Model"),
diff --git a/src/transformers/models/groupvit/__init__.py b/src/transformers/models/groupvit/__init__.py
index 3985e9ecf..0e8b51fed 100644
--- a/src/transformers/models/groupvit/__init__.py
+++ b/src/transformers/models/groupvit/__init__.py
@@ -17,7 +17,7 @@
# limitations under the License.
from typing import TYPE_CHECKING
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
_import_structure = {
@@ -44,6 +44,20 @@ else:
"GroupViTVisionModel",
]
+try:
+ if not is_tf_available():
+ raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+ pass
+else:
+ _import_structure["modeling_tf_groupvit"] = [
+ "TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFGroupViTModel",
+ "TFGroupViTPreTrainedModel",
+ "TFGroupViTTextModel",
+ "TFGroupViTVisionModel",
+ ]
+
if TYPE_CHECKING:
from .configuration_groupvit import (
GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -67,6 +81,20 @@ if TYPE_CHECKING:
GroupViTVisionModel,
)
+ try:
+ if not is_tf_available():
+ raise OptionalDependencyNotAvailable()
+ except OptionalDependencyNotAvailable:
+ pass
+ else:
+ from .modeling_tf_groupvit import (
+ TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFGroupViTModel,
+ TFGroupViTPreTrainedModel,
+ TFGroupViTTextModel,
+ TFGroupViTVisionModel,
+ )
+
else:
import sys
diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py
index 895c0608b..ea4282241 100644
--- a/src/transformers/models/groupvit/configuration_groupvit.py
+++ b/src/transformers/models/groupvit/configuration_groupvit.py
@@ -162,7 +162,7 @@ class GroupViTVisionConfig(PretrainedConfig):
The number of layers in each encoder block.
num_group_tokens (`List[int]`, *optional*, defaults to [64, 8, 0]):
The number of group tokens for each stage.
- num_output_groups (`List[int]`, *optional*, defaults to [64, 8, 0]):
+ num_output_groups (`List[int]`, *optional*, defaults to [64, 8, 8]):
The number of output groups for each stage, 0 means no group.
num_attention_heads (`int`, *optional*, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
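For reviewers: the corrected default can be checked against the config class directly, assuming the code defaults match the docstring:

```python
from transformers import GroupViTVisionConfig

config = GroupViTVisionConfig()
print(config.num_group_tokens)   # [64, 8, 0]
print(config.num_output_groups)  # [64, 8, 8], as documented above
```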
diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index 3d2f78e3c..210a848f2 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -1300,7 +1300,7 @@ class GroupViTVisionModel(GroupViTPreTrainedModel):
>>> import requests
>>> from transformers import AutoProcessor, GroupViTVisionModel
- >>> processor = AutoPProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py
new file mode 100644
index 000000000..481f065eb
--- /dev/null
+++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py
@@ -0,0 +1,1993 @@
+# coding=utf-8
+# Copyright 2022 NVIDIA and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 GroupViT model."""
+
+
+import collections.abc
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import (
+ DUMMY_INPUTS,
+ TFModelInputType,
+ TFPreTrainedModel,
+ get_initializer,
+ keras_serializable,
+ unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_tensorflow_probability_available,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_groupvit import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+# soft dependency
+if is_tensorflow_probability_available():
+ try:
+ import tensorflow_probability as tfp
+
+ # On the first call, check whether a compatible version of TensorFlow is installed
+ # TensorFlow Probability depends on a recent stable release of TensorFlow
+ _ = tfp.distributions.Normal(loc=0.0, scale=1.0)
+ except ImportError:
+ logger.error(
+ "GroupViT models are not usable since `tensorflow_probability` can't be loaded."
+ "It seems you have `tensorflow_probability` installed with the wrong tensorflow version."
+ "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability."
+ )
+
+_CHECKPOINT_FOR_DOC = "nvidia/groupvit-gcc-yfcc"
+
+TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ "nvidia/groupvit-gcc-yfcc",
+ # See all GroupViT models at https://huggingface.co/models?filter=groupvit
+]
+
+
+LARGE_NEGATIVE = -1e8
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None):
+ """
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+ """
+ src_len = shape_list(mask)[1]
+ tgt_len = tgt_len if tgt_len is not None else src_len
+ one_cst = tf.constant(1.0)
+ mask = tf.cast(mask, dtype=one_cst.dtype)
+ expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
+
+ return (one_cst - expanded_mask) * LARGE_NEGATIVE
+
+
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
+def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
+ return tf.math.reduce_mean(
+ tf.keras.metrics.sparse_categorical_crossentropy(
+ y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
+ )
+ )
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->groupvit
+def groupvit_loss(similarity: tf.Tensor) -> tf.Tensor:
+ caption_loss = contrastive_loss(similarity)
+ image_loss = contrastive_loss(tf.transpose(similarity))
+ return (caption_loss + image_loss) / 2.0
+
+
+def hard_softmax(logits: tf.Tensor, dim: int) -> tf.Tensor:
+ y_soft = stable_softmax(logits, dim)
+ # Straight through.
+ index = tf.argmax(y_soft, dim)
+ y_hard = tf.one_hot(
+ index,
+ depth=shape_list(logits)[dim],
+ # `tf.one_hot` only accepts axis=-1 or a non-negative axis,
+ # so a negative `dim` is converted to its positive index here.
+ axis=range(len(shape_list(logits)))[dim],
+ dtype=y_soft.dtype,
+ )
+ ret = y_hard - tf.stop_gradient(y_soft) + y_soft
+
+ return ret
+
+
+def gumbel_softmax(logits: tf.Tensor, tau: float = 1, hard: bool = False, dim: int = -1) -> tf.Tensor:
+ gumbel_dist = tfp.distributions.Gumbel(0.0, 1.0)
+ gumbels = gumbel_dist.sample(tf.shape(logits), dtype=logits.dtype)
+
+ gumbels = (logits + gumbels) / tau # ~Gumbel(logits,tau)
+ y_soft = stable_softmax(gumbels, dim)
+
+ if hard:
+ # Straight through.
+ index = tf.argmax(y_soft, dim)
+ y_hard = tf.one_hot(
+ index,
+ depth=shape_list(logits)[dim],
+ # `tf.one_hot` only accepts axis=-1 or a non-negative axis,
+ # so a negative `dim` is converted to its positive index here.
+ axis=range(len(shape_list(logits)))[dim],
+ dtype=y_soft.dtype,
+ )
+ ret = y_hard - tf.stop_gradient(y_soft) + y_soft
+ else:
+ # Reparametrization trick.
+ ret = y_soft
+ return ret
+
+
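Reviewer note, not part of the patch: both helpers above use the straight-through estimator, so the forward value is a hard one-hot assignment while gradients flow through the soft distribution. A standalone sketch of that trick in plain TF:

```python
import tensorflow as tf

logits = tf.constant([[1.0, 3.0, 0.5],
                      [2.0, 0.1, 0.2]])
y_soft = tf.nn.softmax(logits, axis=-1)
index = tf.argmax(y_soft, axis=-1)
y_hard = tf.one_hot(index, depth=3, dtype=y_soft.dtype)
# Forward pass: exactly one-hot. Backward pass: the gradient of `ret` w.r.t. `logits`
# equals that of `y_soft`, since the (y_hard - stop_gradient(y_soft)) term has no gradient.
ret = y_hard - tf.stop_gradient(y_soft) + y_soft
print(ret.numpy())  # each row is one-hot
```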
+def resize_attention_map(attentions: tf.Tensor, height: int, width: int, align_corners: bool = False) -> tf.Tensor:
+ """
+ Args:
+ attentions (`tf.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
+ height (`int`): height of the output attention map
+ width (`int`): width of the output attention map
+ align_corners (`bool`, *optional*): whether to align corners when resizing; mirrors the `align_corners` argument of `torch.nn.functional.interpolate` in the PyTorch implementation.
+
+ Returns:
+ `tf.Tensor`: resized attention map of shape [batch_size, groups, height, width]
+ """
+
+ scale = (height * width // attentions.shape[2]) ** 0.5
+ if height > width:
+ feat_width = int(np.round(width / scale))
+ feat_height = shape_list(attentions)[2] // feat_width
+ else:
+ feat_height = int(np.round(height / scale))
+ feat_width = shape_list(attentions)[2] // feat_height
+
+ batch_size = shape_list(attentions)[0]
+ groups = shape_list(attentions)[1] # number of group token
+ # [batch_size, groups, height x width] -> [batch_size, groups, height, width]
+ attentions = tf.reshape(attentions, (batch_size, groups, feat_height, feat_width))
+ attentions = tf.transpose(attentions, perm=(0, 2, 3, 1))
+ if align_corners:
+ attentions = tf.compat.v1.image.resize(
+ attentions,
+ size=(height, width),
+ method="bilinear",
+ align_corners=align_corners,
+ )
+ else:
+ attentions = tf.image.resize(attentions, size=(height, width), method="bilinear")
+ attentions = tf.transpose(attentions, perm=(0, 3, 1, 2))
+ return attentions
+
+
+def get_grouping_from_attentions(attentions: Tuple[tf.Tensor], hw_shape: Tuple[int]) -> tf.Tensor:
+ """
+ Args:
+ attentions (`tuple(tf.Tensor)`): tuple of attention maps returned by `TFGroupViTVisionTransformer`
+ hw_shape (`tuple(int)`): height and width of the output attention map
+ Returns:
+ `tf.Tensor`: the attention map of shape [batch_size, groups, height, width]
+ """
+
+ attn_maps = []
+ prev_attn_masks = None
+ for attn_masks in attentions:
+ # [batch_size, num_groups, height x width] -> [batch_size, height x width, num_groups]
+ attn_masks = tf.transpose(attn_masks, perm=(0, 2, 1))
+ if prev_attn_masks is None:
+ prev_attn_masks = attn_masks
+ else:
+ prev_attn_masks = tf.matmul(prev_attn_masks, attn_masks)
+ # [batch_size, height x width, num_groups] -> [batch_size, num_groups, height x width] -> [batch_size, num_groups, height, width]
+ cur_attn_map = resize_attention_map(tf.transpose(prev_attn_masks, perm=(0, 2, 1)), *hw_shape)
+ attn_maps.append(cur_attn_map)
+
+ # [batch_size, num_groups, height, width]
+ final_grouping = attn_maps[-1]
+
+ return tf.stop_gradient(final_grouping)
+
+
+@dataclass
+class TFGroupViTModelOutput(ModelOutput):
+ """
+ Args:
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+ Contrastive loss for image-text similarity.
+ logits_per_image (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+ similarity scores.
+ logits_per_text (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+ similarity scores.
+ segmentation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
+ Classification scores for each pixel.
+
+ <Tip warning={true}>
+
+ The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
+ to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
+ original image size as post-processing. You should always check your logits shape and resize as needed.
+
+ </Tip>
+
+ text_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
+ The text embeddings obtained by applying the projection layer to the pooled output of
+ [`TFGroupViTTextModel`].
+ image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`):
+ The image embeddings obtained by applying the projection layer to the pooled output of
+ [`TFGroupViTVisionModel`].
+ text_model_output (`TFBaseModelOutputWithPooling`):
+ The output of the [`TFGroupViTTextModel`].
+ vision_model_output (`TFBaseModelOutputWithPooling`):
+ The output of the [`TFGroupViTVisionModel`].
+ """
+
+ loss: Optional[tf.Tensor] = None
+ logits_per_image: tf.Tensor = None
+ logits_per_text: tf.Tensor = None
+ segmentation_logits: tf.Tensor = None
+ text_embeds: tf.Tensor = None
+ image_embeds: tf.Tensor = None
+ text_model_output: TFBaseModelOutputWithPooling = None
+ vision_model_output: TFBaseModelOutputWithPooling = None
+
+ def to_tuple(self) -> Tuple[Any]:
+ return tuple(
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+ for k in self.keys()
+ )
+
+
+class TFGroupViTCrossAttentionLayer(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.attn = TFGroupViTAttention(config, name="attn")
+ self.norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm2")
+ self.mlp = TFGroupViTMLP(config, name="mlp")
+ self.norm_post = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_post")
+
+ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False) -> tf.Tensor:
+ x = query
+ x = x + self.attn(query, encoder_hidden_states=key)[0]
+ x = x + self.mlp(self.norm2(x))
+ x = self.norm_post(x)
+ return x
+
+
+class TFGroupViTAssignAttention(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.scale = config.hidden_size**-0.5
+
+ self.q_proj = tf.keras.layers.Dense(config.hidden_size, name="q_proj")
+ self.k_proj = tf.keras.layers.Dense(config.hidden_size, name="k_proj")
+ self.v_proj = tf.keras.layers.Dense(config.hidden_size, name="v_proj")
+ self.proj = tf.keras.layers.Dense(config.hidden_size, name="proj")
+ self.assign_eps = config.assign_eps
+
+ def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor:
+
+ if gumbel and training:
+ attn = gumbel_softmax(attn, dim=-2, hard=hard)
+ else:
+ if hard:
+ attn = hard_softmax(attn, dim=-2)
+ else:
+ attn = stable_softmax(attn, axis=-2)
+
+ return attn
+
+ def call(self, query: tf.Tensor, key: tf.Tensor, training: bool = False):
+ value = key
+ # [batch_size, query_length, channels]
+ query = self.q_proj(query)
+
+ # [batch_size, key_length, channels]
+ key = self.k_proj(key)
+
+ # [batch_size, key_length, channels]
+ value = self.v_proj(value)
+
+ # [batch_size, query_length, key_length]
+ raw_attn = tf.matmul(query, key, transpose_b=True) * self.scale
+
+ attn = self.get_attn(raw_attn, training=training)
+ soft_attn = self.get_attn(raw_attn, training=training, gumbel=False, hard=False)
+
+ attn = attn / (tf.math.reduce_sum(attn, axis=-1, keepdims=True) + self.assign_eps)
+
+ out = tf.matmul(attn, value)
+
+ out = self.proj(out)
+
+ return out, soft_attn
+
+
+class TFGroupViTTokenAssign(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTVisionConfig, num_group_token: int, num_output_group: int, **kwargs):
+ super().__init__(**kwargs)
+ self.num_output_group = num_output_group
+ # norm on group_tokens
+ self.norm_tokens = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_tokens")
+ assign_mlp_ratio = (
+ config.assign_mlp_ratio
+ if isinstance(config.assign_mlp_ratio, collections.abc.Iterable)
+ else (config.assign_mlp_ratio, config.assign_mlp_ratio)
+ )
+ tokens_dim, channels_dim = [int(x * config.hidden_size) for x in assign_mlp_ratio]
+ self.mlp_inter = TFGroupViTMixerMLP(config, num_group_token, tokens_dim, num_output_group, name="mlp_inter")
+ self.norm_post_tokens = tf.keras.layers.LayerNormalization(
+ epsilon=config.layer_norm_eps, name="norm_post_tokens"
+ )
+ # norm on x
+ self.norm_x = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_x")
+ self.pre_assign_attn = TFGroupViTCrossAttentionLayer(config, name="pre_assign_attn")
+
+ self.assign = TFGroupViTAssignAttention(config, name="assign")
+ self.norm_new_x = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="norm_new_x")
+ self.mlp_channels = TFGroupViTMLP(
+ config, config.hidden_size, channels_dim, config.hidden_size, name="mlp_channels"
+ )
+
+ def project_group_token(self, group_tokens: tf.Tensor) -> tf.Tensor:
+ """
+ Args:
+ group_tokens (tf.Tensor): group tokens, [batch_size, num_group_tokens, channels]
+
+ Returns:
+ projected_group_tokens (tf.Tensor): [batch_size, num_output_groups, channels]
+ """
+ # [B, num_output_groups, C] <- [B, num_group_tokens, C]
+ projected_group_tokens = self.mlp_inter(group_tokens)
+ projected_group_tokens = self.norm_post_tokens(projected_group_tokens)
+ return projected_group_tokens
+
+ def call(self, image_tokens: tf.Tensor, group_tokens: tf.Tensor, training: bool = False):
+ """
+ Args:
+ image_tokens (`tf.Tensor`): image tokens, of shape [batch_size, input_length, channels]
+ group_tokens (`tf.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
+ """
+
+ group_tokens = self.norm_tokens(group_tokens)
+ image_tokens = self.norm_x(image_tokens)
+ # [batch_size, num_output_groups, channels]
+ projected_group_tokens = self.project_group_token(group_tokens)
+ projected_group_tokens = self.pre_assign_attn(projected_group_tokens, image_tokens)
+ new_image_tokens, attention = self.assign(projected_group_tokens, image_tokens)
+ new_image_tokens += projected_group_tokens
+
+ new_image_tokens = new_image_tokens + self.mlp_channels(self.norm_new_x(new_image_tokens))
+
+ return new_image_tokens, attention
+
+
+# Adapted from transformers.models.vit.modeling_tf_vit.TFViTPatchEmbeddings with ViT->GroupViT
+class TFGroupViTPatchEmbeddings(tf.keras.layers.Layer):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config: GroupViTConfig, **kwargs):
+ super().__init__(**kwargs)
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels = config.num_channels
+ # hidden_size is a member as it will be required in the call method
+ self.hidden_size = config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+ self.num_channels = num_channels
+ self.config = config
+
+ self.projection = tf.keras.layers.Conv2D(
+ filters=self.hidden_size,
+ kernel_size=patch_size,
+ strides=patch_size,
+ padding="valid",
+ data_format="channels_last",
+ use_bias=True,
+ kernel_initializer=get_initializer(self.config.initializer_range),
+ bias_initializer="zeros",
+ name="projection",
+ )
+
+ def call(
+ self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
+ ) -> tf.Tensor:
+ batch_size, num_channels, height, width = shape_list(pixel_values)
+ if tf.executing_eagerly() and num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ if (
+ not interpolate_pos_encoding
+ and tf.executing_eagerly()
+ and (height != self.image_size[0] or width != self.image_size[1])
+ ):
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+ )
+
+ # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format.
+ # So change the input format from `NCHW` to `NHWC`.
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+ pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+ projection = self.projection(pixel_values)
+
+ # Change the 2D spatial dimensions to a single temporal dimension.
+ # shape = (batch_size, num_patches, out_channels=embed_dim)
+ num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0])
+ # In the TFGroupViTVisionEmbeddings the embeddings from this layer will be layer normalized
+ # LayerNormalization layer needs to have static last dimension (otherwise the test_keras_save_load fails with symbolic tensors)
+ # This is why we have used the hidden_size in the reshape method
+ embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, self.hidden_size))
+
+ return embeddings
+
+
+# Adapted from transformers.models.vit.modeling_tf_vit.TFViTEmbeddings
+class TFGroupViTVisionEmbeddings(tf.keras.layers.Layer):
+ """
+ Construct the position and patch embeddings.
+
+ """
+
+ def __init__(self, config: GroupViTVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.patch_embeddings = TFGroupViTPatchEmbeddings(config, name="patch_embeddings")
+ self.dropout = tf.keras.layers.Dropout(rate=config.dropout, name="dropout")
+ self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+ self.config = config
+
+ def build(self, input_shape: tf.TensorShape):
+
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = self.add_weight(
+ shape=(1, num_patches, self.config.hidden_size),
+ initializer="zeros",
+ trainable=True,
+ name="position_embeddings",
+ )
+
+ super().build(input_shape)
+
+ def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
+ """
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+ resolution images.
+
+ Source:
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+ """
+
+ batch_size, num_patches, dim = shape_list(embeddings)
+ num_positions = shape_list(self.position_embeddings)[1]
+
+ if num_patches == num_positions and height == width:
+ return self.position_embeddings
+ patch_pos_embed = self.position_embeddings
+ h0 = height // self.config.patch_size
+ w0 = width // self.config.patch_size
+ patch_pos_embed = tf.image.resize(
+ images=tf.reshape(
+ patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
+ ),
+ size=(h0, w0),
+ method="bicubic",
+ )
+ patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
+ return patch_pos_embed
+
+ def call(
+ self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False
+ ) -> tf.Tensor:
+ _, _, height, width = shape_list(pixel_values)
+ embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+ embeddings = self.layernorm(embeddings)
+
+ # add positional encoding to each token
+ if interpolate_pos_encoding:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+ else:
+ embeddings = embeddings + self.position_embeddings
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->GroupViT
+class TFGroupViTTextEmbeddings(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTTextConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embed_dim = config.hidden_size
+ self.vocab_size = config.vocab_size
+
+ self.config = config
+
+ def build(self, input_shape: tf.TensorShape):
+
+ with tf.name_scope("token_embedding"):
+ self.weight = self.add_weight(
+ shape=(self.vocab_size, self.embed_dim),
+ initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+ trainable=True,
+ name="weight",
+ )
+
+ with tf.name_scope("position_embedding"):
+ self.position_embedding = self.add_weight(
+ shape=(self.config.max_position_embeddings, self.embed_dim),
+ initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+ trainable=True,
+ name="embeddings",
+ )
+
+ super().build(input_shape)
+
+ def call(
+ self,
+ input_ids: tf.Tensor = None,
+ position_ids: tf.Tensor = None,
+ inputs_embeds: tf.Tensor = None,
+ ) -> tf.Tensor:
+ """
+ Applies embedding based on inputs tensor.
+
+ Returns:
+ final_embeddings (`tf.Tensor`): output embedding tensor.
+ """
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ # Note: tf.gather, on which the embedding layer is based, won't check positive out of bound
+ # indices on GPU, returning zeros instead. This is a dangerous silent behavior.
+ tf.debugging.assert_less(
+ input_ids,
+ tf.cast(self.vocab_size, dtype=input_ids.dtype),
+ message=(
+ "input_ids must be smaller than the embedding layer's input dimension (got"
+ f" {tf.math.reduce_max(input_ids)} >= {self.vocab_size})"
+ ),
+ )
+ inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+ input_shape = shape_list(inputs_embeds)[:-1]
+
+ if position_ids is None:
+ position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+ position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
+ position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
+ final_embeddings = inputs_embeds + position_embeds
+
+ return final_embeddings
+
+
+class TFGroupViTStage(tf.keras.layers.Layer):
+ """This corresponds to the `GroupingLayer` class in the GroupViT implementation."""
+
+ def __init__(
+ self,
+ config: GroupViTVisionConfig,
+ depth: int,
+ num_prev_group_token: int,
+ num_group_token: int,
+ num_output_group: int,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.config = config
+ self.depth = depth
+ self.num_group_token = num_group_token
+ self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(depth)]
+
+ if num_group_token > 0:
+ self.downsample = TFGroupViTTokenAssign(
+ config=config,
+ num_group_token=num_group_token,
+ num_output_group=num_output_group,
+ name="downsample",
+ )
+ else:
+ self.downsample = None
+
+ if num_prev_group_token > 0 and num_group_token > 0:
+ self.group_projector = [
+ tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="group_projector.0"),
+ TFGroupViTMixerMLP(
+ config, num_prev_group_token, config.hidden_size // 2, num_group_token, name="group_projector.1"
+ ),
+ ]
+ else:
+ self.group_projector = None
+
+ def build(self, input_shape: tf.TensorShape):
+ if self.num_group_token > 0:
+ self.group_token = self.add_weight(
+ shape=(1, self.num_group_token, self.config.hidden_size),
+ initializer="zeros",
+ trainable=True,
+ name="group_token",
+ )
+ else:
+ self.group_token = None
+ super().build(input_shape)
+
+ @property
+ def with_group_token(self):
+ return self.group_token is not None
+
+ def split_x(self, x: tf.Tensor) -> tf.Tensor:
+ if self.with_group_token:
+ return x[:, : -self.num_group_token], x[:, -self.num_group_token :]
+ else:
+ return x, None
+
+ def concat_x(self, x: tf.Tensor, group_token: Optional[tf.Tensor] = None) -> tf.Tensor:
+ if group_token is None:
+ return x
+ return tf.concat([x, group_token], axis=1)
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ prev_group_token: Optional[tf.Tensor] = None,
+ output_attentions: bool = False,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ """
+ Args:
+ hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ prev_group_token (`tf.Tensor`, *optional*): group tokens from the previous stage, of shape
+ `(batch, num_prev_group_tokens, embed_dim)`
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the grouping tensors of Grouping block.
+ """
+ if self.with_group_token:
+ group_token = tf.tile(self.group_token, multiples=(shape_list(hidden_states)[0], 1, 1))
+ if self.group_projector is not None:
+ for layer in self.group_projector:
+ prev_group_token = layer(prev_group_token)
+ group_token = group_token + prev_group_token
+ else:
+ group_token = None
+
+ x = hidden_states
+
+ cat_x = self.concat_x(x, group_token)
+ for layer in self.layers:
+ layer_out = layer(
+ cat_x,
+ attention_mask=None,
+ causal_attention_mask=None,
+ output_attentions=None,
+ )
+ cat_x = layer_out[0]
+
+ x, group_token = self.split_x(cat_x)
+
+ attention = None
+ if self.downsample is not None:
+ x, attention = self.downsample(x, group_token)
+
+ outputs = (x, group_token)
+ if output_attentions:
+ outputs = outputs + (attention,)
+
+ return outputs
+
+
+class TFGroupViTMLP(tf.keras.layers.Layer):
+ def __init__(
+ self,
+ config: GroupViTVisionConfig,
+ hidden_size: Optional[int] = None,
+ intermediate_size: Optional[int] = None,
+ output_size: Optional[int] = None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.config = config
+ self.activation_fn = get_tf_activation(config.hidden_act)
+ hidden_size = hidden_size if hidden_size is not None else config.hidden_size
+ intermediate_size = intermediate_size if intermediate_size is not None else config.intermediate_size
+ output_size = output_size if output_size is not None else hidden_size
+ self.fc1 = tf.keras.layers.Dense(intermediate_size, name="fc1")
+ self.fc2 = tf.keras.layers.Dense(output_size, name="fc2")
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states = self.fc2(hidden_states)
+ return hidden_states
+
+
+class TFGroupViTMixerMLP(TFGroupViTMLP):
+ def call(self, x, training: bool = False):
+ x = super().call(hidden_states=tf.transpose(x, perm=(0, 2, 1)))
+ return tf.transpose(x, perm=(0, 2, 1))
+
+
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPAttention
+class TFGroupViTAttention(tf.keras.layers.Layer):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: GroupViTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embed_dim = config.hidden_size
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = self.embed_dim // self.num_attention_heads
+ if self.attention_head_size * self.num_attention_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {self.num_attention_heads})."
+ )
+
+ factor = config.initializer_factor
+ in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor
+ out_proj_std = (self.embed_dim**-0.5) * factor
+
+ self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+ self.q_proj = tf.keras.layers.Dense(
+ units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="q_proj"
+ )
+ self.k_proj = tf.keras.layers.Dense(
+ units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="k_proj"
+ )
+ self.v_proj = tf.keras.layers.Dense(
+ units=self.embed_dim, kernel_initializer=get_initializer(in_proj_std), name="v_proj"
+ )
+
+ self.dropout = tf.keras.layers.Dropout(rate=config.attention_dropout)
+
+ self.out_proj = tf.keras.layers.Dense(
+ units=self.embed_dim, kernel_initializer=get_initializer(out_proj_std), name="out_proj"
+ )
+
+ # Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention.transpose_for_scores
+ def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+ # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+ tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+ # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+ return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor = None,
+ causal_attention_mask: tf.Tensor = None,
+ output_attentions: bool = None,
+ encoder_hidden_states: tf.Tensor = None,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ """Input shape: Batch x Time x Channel"""
+
+ batch_size = shape_list(hidden_states)[0]
+ is_cross_attention = encoder_hidden_states is not None
+
+ mixed_query_layer = self.q_proj(inputs=hidden_states)
+ if is_cross_attention:
+ mixed_key_layer = self.k_proj(inputs=encoder_hidden_states)
+ mixed_value_layer = self.v_proj(inputs=encoder_hidden_states)
+ else:
+ mixed_key_layer = self.k_proj(inputs=hidden_states)
+ mixed_value_layer = self.v_proj(inputs=hidden_states)
+
+ query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+ key_layer = self.transpose_for_scores(mixed_key_layer, batch_size)
+ value_layer = self.transpose_for_scores(mixed_value_layer, batch_size)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ # (batch size, num_heads, seq_len_q, seq_len_k)
+ attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+ dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+ attention_scores = tf.divide(attention_scores, dk)
+
+ # apply the causal_attention_mask first
+ if causal_attention_mask is not None:
+ # Apply the causal attention mask (precomputed for all layers in TFCLIPModel call() function)
+ attention_scores = tf.add(attention_scores, causal_attention_mask)
+
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in TFCLIPModel call() function)
+ attention_scores = tf.add(attention_scores, attention_mask)
+
+ # Normalize the attention scores to probabilities.
+ _attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(inputs=_attention_probs)
+
+ attention_output = tf.matmul(attention_probs, value_layer)
+ attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+ # (batch_size, seq_len_q, embed_dim)
+ attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.embed_dim))
+
+ attention_output = self.out_proj(attention_output)
+ # In TFBert, attention weights are returned after dropout.
+ # However, in CLIP, they are returned before dropout.
+ outputs = (attention_output, _attention_probs) if output_attentions else (attention_output,)
+
+ return outputs
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPEncoderLayer with CLIP->GroupViT
+class TFGroupViTEncoderLayer(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embed_dim = config.hidden_size
+ self.self_attn = TFGroupViTAttention(config, name="self_attn")
+ self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+ self.mlp = TFGroupViTMLP(config, name="mlp")
+ self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ attention_mask: tf.Tensor,
+ causal_attention_mask: tf.Tensor,
+ output_attentions: bool,
+ training: bool = False,
+ ) -> Tuple[tf.Tensor]:
+ """
+ Args:
+ hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`tf.Tensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ causal_attention_mask (`tf.Tensor`): causal attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`):
+ Whether or not to return the attentions tensors of all attention layers. See `outputs` under returned
+ tensors for more detail.
+ """
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(inputs=hidden_states)
+ attention_outputs = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ training=training,
+ )
+ hidden_states = attention_outputs[0]
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(inputs=hidden_states)
+ hidden_states = self.mlp(hidden_states=hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,) + attention_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPEncoder
+class TFGroupViTTextEncoder(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTTextConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.layers = [TFGroupViTEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
+
+ def call(
+ self,
+ hidden_states,
+ attention_mask: tf.Tensor,
+ causal_attention_mask: tf.Tensor,
+ output_attentions: bool,
+ output_hidden_states: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> Union[Tuple, TFBaseModelOutput]:
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ causal_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+class TFGroupViTVisionEncoder(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTVisionConfig, **kwargs) -> None:
+ super().__init__(**kwargs)
+
+ self.stages = [
+ TFGroupViTStage(
+ config=config,
+ depth=config.depths[i],
+ num_group_token=config.num_group_tokens[i],
+ num_output_group=config.num_output_groups[i],
+ num_prev_group_token=config.num_output_groups[i - 1] if i > 0 else 0,
+ name=f"stages_._{i}",
+ )
+ for i in range(len(config.depths))
+ ]
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ output_hidden_states: bool,
+ output_attentions: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> Union[tuple, TFBaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_groupings = () if output_attentions else None
+
+ group_tokens = None
+
+ for stage in self.stages:
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_outputs = stage(hidden_states, group_tokens, output_attentions)
+
+ hidden_states = layer_outputs[0]
+ group_tokens = layer_outputs[1]
+
+ if output_attentions and layer_outputs[2] is not None:
+ all_groupings = all_groupings + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_groupings] if v is not None)
+ return TFBaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_groupings
+ )
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextTransformer with CLIPText->GroupViTText, CLIPEncoder->GroupViTTextEncoder
+class TFGroupViTTextTransformer(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTTextConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embeddings = TFGroupViTTextEmbeddings(config, name="embeddings")
+ self.encoder = TFGroupViTTextEncoder(config, name="encoder")
+ self.final_layer_norm = tf.keras.layers.LayerNormalization(
+ epsilon=config.layer_norm_eps, name="final_layer_norm"
+ )
+
+ def call(
+ self,
+ input_ids: TFModelInputType,
+ attention_mask: tf.Tensor,
+ position_ids: tf.Tensor,
+ output_attentions: bool,
+ output_hidden_states: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+ input_shape = shape_list(input_ids)
+
+ embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+ batch_size, seq_length = input_shape
+ # CLIP's text model uses causal mask, prepare it here.
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+ causal_attention_mask = self._build_causal_attention_mask(batch_size, seq_length, dtype=embedding_output.dtype)
+
+ # check attention mask and invert
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _expand_mask(attention_mask)
+
+ encoder_outputs = self.encoder(
+ hidden_states=embedding_output,
+ attention_mask=attention_mask,
+ causal_attention_mask=causal_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.final_layer_norm(inputs=sequence_output)
+
+ # text_embeds.shape = [batch_size, n_ctx, transformer.width]
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
+ pooled_output = tf.gather_nd(
+ params=sequence_output,
+ indices=tf.stack(
+ values=(tf.range(input_shape[0], dtype=tf.int64), tf.math.argmax(input_ids, axis=-1)), axis=1
+ ),
+ )
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
+ # It is possible with an unspecified sequence length for seq_length to be
+ # a runtime value, which is unsupported by tf.constant. Per the TensorFlow
+ # docs, tf.fill can handle runtime dynamic shapes:
+ # https://www.tensorflow.org/api_docs/python/tf/fill
+ diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)
+
+ # set an additive 2D attention mask with all places being masked
+ to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)
+
+ # set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
+ # TIP: think the 2D matrix as the space of (query_seq, key_seq)
+ to_mask = tf.linalg.band_part(to_mask, 0, -1)
+ # to_mask = tf.linalg.band_part(to_mask, -1, 0)
+ to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)
+
+ return tf.broadcast_to(input=to_mask, shape=(batch_size, 1, seq_length, seq_length))
+
+
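Reviewer note, not part of the patch: for `seq_length=3` the additive causal mask built by `_build_causal_attention_mask` above looks as follows, with 0 where a query may attend and -10000 on strictly future positions:

```python
import tensorflow as tf

seq_length = 3
diag = tf.fill((seq_length,), 0.0)
to_mask = tf.fill((seq_length, seq_length), -10000.0)
to_mask = tf.linalg.band_part(to_mask, 0, -1)          # zero out the lower triangle
to_mask = tf.linalg.set_diag(to_mask, diagonal=diag)   # unmask the diagonal
print(to_mask.numpy())
# [[     0. -10000. -10000.]
#  [     0.      0. -10000.]
#  [     0.      0.      0.]]
```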
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPVisionTransformer
+class TFGroupViTVisionTransformer(tf.keras.layers.Layer):
+ def __init__(self, config: GroupViTVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.embeddings = TFGroupViTVisionEmbeddings(config, name="embeddings")
+ self.encoder = TFGroupViTVisionEncoder(config, name="encoder")
+ self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+
+ def call(
+ self,
+ pixel_values: TFModelInputType,
+ output_attentions: bool,
+ output_hidden_states: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> Union[Tuple, TFBaseModelOutputWithPooling]:
+
+ embedding_output = self.embeddings(pixel_values)
+
+ encoder_outputs = self.encoder(
+ hidden_states=embedding_output,
+ output_hidden_states=output_hidden_states,
+ output_attentions=output_attentions,
+ return_dict=return_dict,
+ )
+
+ last_hidden_state = encoder_outputs[0]
+
+ # normalize the last hidden state
+ last_hidden_state = self.layernorm(last_hidden_state)
+ pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@keras_serializable
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextMainLayer with CLIP->GroupViT
+class TFGroupViTTextMainLayer(tf.keras.layers.Layer):
+ config_class = GroupViTTextConfig
+
+ def __init__(self, config: GroupViTTextConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.text_model = TFGroupViTTextTransformer(config, name="text_model")
+
+ def get_input_embeddings(self) -> tf.keras.layers.Layer:
+ return self.text_model.embeddings
+
+ def set_input_embeddings(self, value: tf.Variable):
+ self.text_model.embeddings.weight = value
+ self.text_model.embeddings.vocab_size = shape_list(value)[0]
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+ if input_ids is None:
+ raise ValueError("You have to specify input_ids")
+
+ input_shape = shape_list(input_ids)
+
+ if attention_mask is None:
+ attention_mask = tf.fill(dims=input_shape, value=1)
+
+ text_model_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return text_model_outputs
+
+
+@keras_serializable
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPVisionMainLayer with CLIP->GroupViT
+class TFGroupViTVisionMainLayer(tf.keras.layers.Layer):
+ config_class = GroupViTVisionConfig
+
+ def __init__(self, config: GroupViTVisionConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.config = config
+ self.vision_model = TFGroupViTVisionTransformer(config, name="vision_model")
+
+ def get_input_embeddings(self) -> tf.keras.layers.Layer:
+ return self.vision_model.embeddings
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: Optional[TFModelInputType] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ vision_model_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return vision_model_outputs
+
+
+@keras_serializable
+# Adapted from transformers.models.clip.modeling_tf_clip.TFCLIPMainLayer
+class TFGroupViTMainLayer(tf.keras.layers.Layer):
+ config_class = GroupViTConfig
+
+ def __init__(self, config: GroupViTConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ if not isinstance(config.text_config, GroupViTTextConfig):
+ raise ValueError(
+ "config.text_config is expected to be of type GroupViTTextConfig but is of type"
+ f" {type(config.text_config)}."
+ )
+
+ if not isinstance(config.vision_config, GroupViTVisionConfig):
+ raise ValueError(
+ "config.vision_config is expected to be of type GroupViTVisionConfig but is of type"
+ f" {type(config.vision_config)}."
+ )
+
+ self.config = config
+
+ text_config = config.text_config
+ vision_config = config.vision_config
+
+ self.projection_dim = config.projection_dim
+ self.projection_intermediate_dim = config.projection_intermediate_dim
+ self.text_embed_dim = text_config.hidden_size
+ self.vision_embed_dim = vision_config.hidden_size
+
+ self.text_model = TFGroupViTTextTransformer(text_config, name="text_model")
+ self.vision_model = TFGroupViTVisionTransformer(vision_config, name="vision_model")
+
+ self.visual_projection = [
+ tf.keras.layers.Dense(self.projection_intermediate_dim, name="visual_projection.0"),
+ tf.keras.layers.BatchNormalization(name="visual_projection.1", momentum=0.1, epsilon=1e-5),
+ tf.keras.layers.ReLU(name="visual_projection.2"),
+ tf.keras.layers.Dense(self.projection_dim, name="visual_projection.3"),
+ ]
+ self.text_projection = [
+ tf.keras.layers.Dense(self.projection_intermediate_dim, name="text_projection.0"),
+ tf.keras.layers.BatchNormalization(name="text_projection.1", momentum=0.1, epsilon=1e-5),
+ tf.keras.layers.ReLU(name="text_projection.2"),
+ tf.keras.layers.Dense(self.projection_dim, name="text_projection.3"),
+ ]
+
+ def build(self, input_shape: tf.TensorShape):
+
+ self.logit_scale = self.add_weight(
+ shape=(1,),
+ initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value),
+ trainable=True,
+ name="logit_scale",
+ )
+
+ super().build(input_shape)
+
+ @unpack_inputs
+ def get_text_features(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+
+ if input_ids is None:
+ raise ValueError("You have to specify either input_ids")
+
+ input_shape = shape_list(input_ids)
+
+ if attention_mask is None:
+ attention_mask = tf.fill(dims=input_shape, value=1)
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ pooled_output = text_outputs[1]
+ for layer in self.text_projection:
+ pooled_output = layer(pooled_output)
+
+ text_features = pooled_output
+ return text_features
+
+ @unpack_inputs
+ def get_image_features(
+ self,
+ pixel_values: Optional[TFModelInputType] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ pooled_output = vision_outputs[1]
+ for layer in self.visual_projection:
+ pooled_output = layer(pooled_output)
+
+ image_features = pooled_output
+ return image_features
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ pixel_values: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_segmentation: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]:
+
+ if input_ids is None:
+ raise ValueError("You have to specify either input_ids")
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ input_shape = shape_list(input_ids)
+
+ if attention_mask is None:
+ attention_mask = tf.fill(dims=input_shape, value=1)
+ if output_segmentation:
+ output_attentions = True
+ vision_outputs = self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ text_outputs = self.text_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ image_embeds = vision_outputs[1]
+ for layer in self.visual_projection:
+ image_embeds = layer(image_embeds)
+
+ text_embeds = text_outputs[1]
+ for layer in self.text_projection:
+ text_embeds = layer(text_embeds)
+
+ # normalized features
+ image_embeds = image_embeds / tf.norm(image_embeds, axis=-1, keepdims=True)
+ text_embeds = text_embeds / tf.norm(text_embeds, axis=-1, keepdims=True)
+
+ # cosine similarity as logits
+ logit_scale = tf.math.exp(self.logit_scale)
+ logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
+ logits_per_image = tf.transpose(logits_per_text)
+
+ seg_logits = None
+ if output_segmentation:
+ # grouped features
+ # [batch_size_image, num_group, hidden_size]
+ image_group_embeds = vision_outputs[0]
+ # [batch_size_image*num_group, hidden_size]
+ image_group_embeds = tf.reshape(image_group_embeds, shape=(-1, shape_list(image_group_embeds)[-1]))
+ for layer in self.visual_projection:
+ image_group_embeds = layer(image_group_embeds)
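+            # hidden states, when requested, shift the grouping attentions from index 2 to index 3 of the vision outputs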
+ if output_hidden_states:
+ attentions = vision_outputs[3]
+ else:
+ attentions = vision_outputs[2]
+ # [batch_size_image, num_group, height, width]
+ grouping = get_grouping_from_attentions(attentions, pixel_values.shape[2:])
+
+ # normalized features
+ image_group_embeds = image_group_embeds / tf.norm(
+ tensor=image_group_embeds, ord="euclidean", axis=-1, keepdims=True
+ )
+ # [batch_size_image x num_group, batch_size_text]
+ logits_per_image_group = tf.matmul(image_group_embeds, text_embeds, transpose_b=True) * logit_scale
+ # [batch_size_image, batch_size_text, num_group]
+ logits_per_image_group = tf.reshape(
+ logits_per_image_group, shape=(image_embeds.shape[0], -1, text_embeds.shape[0])
+ )
+ logits_per_image_group = tf.transpose(logits_per_image_group, perm=(0, 2, 1))
+
+ # [batch_size_image, batch_size_text, height x width]
+ flatten_grouping = tf.reshape(grouping, shape=(shape_list(grouping)[0], shape_list(grouping)[1], -1))
+
+ # [batch_size_image, batch_size_text, height, width]
+ seg_logits = tf.matmul(logits_per_image_group, flatten_grouping) * logit_scale
+ seg_logits = tf.reshape(
+ seg_logits, shape=(seg_logits.shape[0], seg_logits.shape[1], grouping.shape[2], grouping.shape[3])
+ )
+
+ loss = None
+ if return_loss:
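+            # `[None, ...]` adds a leading axis so the scalar contrastive loss is returned with shape `(1,)`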
+ loss = groupvit_loss(logits_per_text)[None, ...]
+
+ if not return_dict:
+ if seg_logits is not None:
+ output = (
+ logits_per_image,
+ logits_per_text,
+ seg_logits,
+ text_embeds,
+ image_embeds,
+ text_outputs,
+ vision_outputs,
+ )
+ else:
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+ return ((loss,) + output) if loss is not None else output
+
+ return TFGroupViTModelOutput(
+ loss=loss,
+ logits_per_image=logits_per_image,
+ logits_per_text=logits_per_text,
+ segmentation_logits=seg_logits,
+ text_embeds=text_embeds,
+ image_embeds=image_embeds,
+ text_model_output=text_outputs,
+ vision_model_output=vision_outputs,
+ )
+
+
+class TFGroupViTPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = GroupViTConfig
+ base_model_prefix = "groupvit"
+
+
+GROUPVIT_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+    heads, etc.).
+
+ This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+ as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+ behavior.
+
+
+
+    TF 2.0 models accept two formats as inputs:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    This second option is useful when using the [`tf.keras.Model.fit`] method, which currently requires having all
+    the tensors in the first argument of the model call function: `model(inputs)`.
+
+    If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+    the first positional argument (see the sketch below):
+
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
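+
+    For instance, with the `nvidia/groupvit-gcc-yfcc` checkpoint used in the usage examples below, the following
+    calls are equivalent (a minimal illustrative sketch, not an exhaustive list of the supported input formats):
+
+    ```python
+    import requests
+    from PIL import Image
+    from transformers import CLIPProcessor, TFGroupViTModel
+
+    model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+    processor = CLIPProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image = Image.open(requests.get(url, stream=True).raw)
+    inputs = processor(text=["a photo of a cat"], images=image, padding=True, return_tensors="tf")
+
+    # all inputs passed as keyword arguments ...
+    outputs = model(input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"])
+    # ... or gathered in a dictionary passed as the first positional argument
+    outputs = model({"input_ids": inputs["input_ids"], "pixel_values": inputs["pixel_values"]})
+    ```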
+
+
+
+ Args:
+ config ([`GroupViTConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+GROUPVIT_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+ [`PreTrainedTokenizer.encode`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+GROUPVIT_VISION_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
+ [`CLIPFeatureExtractor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+GROUPVIT_INPUTS_DOCSTRING = r"""
+ Args:
+        input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`CLIPTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+ [`PreTrainedTokenizer.encode`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`CLIPFeatureExtractor`]. See
+ [`CLIPFeatureExtractor.__call__`] for details.
+ attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ return_loss (`bool`, *optional*):
+ Whether or not to return the contrastive loss.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+class TFGroupViTTextModel(TFGroupViTPreTrainedModel):
+ config_class = GroupViTTextConfig
+ main_input_name = "input_ids"
+
+ def __init__(self, config: GroupViTTextConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.groupvit = TFGroupViTTextMainLayer(config, name="groupvit")
+
+ @property
+ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+ """
+ Dummy inputs to build the network.
+
+ Returns:
+ `Dict[str, tf.Tensor]`: The dummy inputs.
+ """
+ return {
+ "input_ids": tf.constant(DUMMY_INPUTS, dtype=tf.int32),
+ }
+
+ @tf.function(
+ input_signature=[
+ {
+ "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"),
+ "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"),
+ }
+ ]
+ )
+ def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
+ output = self.call(inputs)
+ return self.serving_output(output)
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTTextConfig)
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import CLIPTokenizer, TFGroupViTTextModel
+
+ >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> model = TFGroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+ ```"""
+
+ outputs = self.groupvit(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=output.last_hidden_state,
+ pooler_output=output.pooler_output,
+ hidden_states=hs,
+ attentions=attns,
+ )
+
+
+class TFGroupViTVisionModel(TFGroupViTPreTrainedModel):
+ config_class = GroupViTVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: GroupViTVisionConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.groupvit = TFGroupViTVisionMainLayer(config, name="groupvit")
+
+ @property
+ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+ """
+ Dummy inputs to build the network.
+
+ Returns:
+ `Dict[str, tf.Tensor]`: The dummy inputs.
+ """
+ VISION_DUMMY_INPUTS = tf.random.uniform(
+ shape=(len(DUMMY_INPUTS), 3, self.config.image_size, self.config.image_size), dtype=tf.float32
+ )
+ return {"pixel_values": VISION_DUMMY_INPUTS}
+
+ @tf.function(
+ input_signature=[
+ {
+ "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+ }
+ ]
+ )
+ def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
+ """
+ Method used for serving the model.
+
+ Args:
+ inputs (`Dict[str, tf.Tensor]`):
+ The input of the saved model as a dictionary of tensors.
+ """
+ output = self.call(inputs)
+
+ return self.serving_output(output)
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=GroupViTVisionConfig)
+ def call(
+ self,
+ pixel_values: Optional[TFModelInputType] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, TFGroupViTVisionModel
+
+ >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> model = TFGroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="tf")
+
+ >>> outputs = model(**inputs)
+ >>> last_hidden_state = outputs.last_hidden_state
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
+ ```"""
+
+ outputs = self.groupvit(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+ # hidden_states and attentions not converted to Tensor with tf.convert_to_tensor as they are all of different dimensions
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=output.last_hidden_state,
+ pooler_output=output.pooler_output,
+ hidden_states=output.hidden_states,
+ attentions=output.attentions,
+ )
+
+
+@add_start_docstrings(GROUPVIT_START_DOCSTRING)
+class TFGroupViTModel(TFGroupViTPreTrainedModel):
+ config_class = GroupViTConfig
+
+ def __init__(self, config: GroupViTConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ self.groupvit = TFGroupViTMainLayer(config, name="groupvit")
+
+ @property
+ def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+ """
+ Dummy inputs to build the network.
+
+ Returns:
+ `Dict[str, tf.Tensor]`: The dummy inputs.
+ """
+ VISION_DUMMY_INPUTS = tf.random.uniform(
+ shape=(len(DUMMY_INPUTS), 3, self.config.vision_config.image_size, self.config.vision_config.image_size),
+ dtype=tf.float32,
+ )
+ return {
+ "input_ids": tf.constant(DUMMY_INPUTS, dtype=tf.int32),
+ "pixel_values": VISION_DUMMY_INPUTS,
+ }
+
+ @tf.function(
+ input_signature=[
+ {
+ "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"),
+ "pixel_values": tf.TensorSpec((None, None, None, None), tf.float64, name="pixel_values"),
+ "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"),
+ }
+ ]
+ )
+ def serving(self, inputs: Dict[str, tf.Tensor]) -> TFGroupViTModelOutput:
+ """
+ Method used for serving the model.
+
+ Args:
+ inputs (`Dict[str, tf.Tensor]`):
+ The input of the saved model as a dictionary of tensors.
+ """
+ output = self.call(inputs)
+
+ return self.serving_output(output)
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(GROUPVIT_TEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ def get_text_features(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+ r"""
+ Returns:
+            text_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by applying
+ the projection layer to the pooled output of [`TFGroupViTTextModel`].
+
+ Examples:
+
+ ```python
+ >>> from transformers import CLIPTokenizer, TFGroupViTModel
+
+ >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
+ >>> text_features = model.get_text_features(**inputs)
+ ```"""
+
+ text_features = self.groupvit.get_text_features(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return text_features
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(GROUPVIT_VISION_INPUTS_DOCSTRING)
+ def get_image_features(
+ self,
+ pixel_values: Optional[TFModelInputType] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> tf.Tensor:
+ r"""
+ Returns:
+            image_features (`tf.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by applying
+ the projection layer to the pooled output of [`TFGroupViTVisionModel`].
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, TFGroupViTModel
+
+ >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(images=image, return_tensors="tf")
+
+ >>> image_features = model.get_image_features(**inputs)
+ ```"""
+
+ image_features = self.groupvit.get_image_features(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return image_features
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(GROUPVIT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=TFGroupViTModelOutput, config_class=GroupViTConfig)
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ pixel_values: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ return_loss: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_segmentation: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+        >>> import tensorflow as tf
+        >>> from transformers import AutoProcessor, TFGroupViTModel
+
+ >>> model = TFGroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
+ >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> inputs = processor(
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
+ ... )
+
+ >>> outputs = model(**inputs)
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+        >>> probs = tf.math.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
+ ```"""
+
+ outputs = self.groupvit(
+ input_ids=input_ids,
+ pixel_values=pixel_values,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ return_loss=return_loss,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_segmentation=output_segmentation,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+ def serving_output(self, output: TFGroupViTModelOutput) -> TFGroupViTModelOutput:
+ # TODO: As is this currently fails with saved_model=True, because
+ # TensorFlow cannot trace through nested dataclasses. Reference:
+ # https://github.com/huggingface/transformers/pull/16886
+ return output
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 69e11eeb3..3acc78046 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1309,6 +1309,37 @@ class TFGPTJPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["tf"])
+TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFGroupViTModel(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFGroupViTPreTrainedModel(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFGroupViTTextModel(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFGroupViTVisionModel(metaclass=DummyObject):
+ _backends = ["tf"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
TF_HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index bd6dbd3bc..76b9d02a8 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -17,6 +17,7 @@
import inspect
import os
+import random
import tempfile
import unittest
@@ -24,7 +25,7 @@ import numpy as np
import requests
from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
-from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
@@ -95,7 +96,8 @@ class GroupViTVisionModelTester:
self.seq_length = num_patches
def prepare_config_and_inputs(self):
- pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
+ rng = random.Random(0)
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng)
config = self.get_config()
return config, pixel_values
@@ -161,6 +163,18 @@ class GroupViTVisionModelTest(ModelTesterMixin, unittest.TestCase):
def test_inputs_embeds(self):
pass
+ @is_pt_tf_cross_test
+ def test_pt_tf_model_equivalence(self):
+ import tensorflow as tf
+
+ seed = 338
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ tf.random.set_seed(seed)
+ return super().test_pt_tf_model_equivalence()
+
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
@@ -368,7 +382,8 @@ class GroupViTTextModelTester:
self.scope = scope
def prepare_config_and_inputs(self):
- input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+ rng = random.Random(0)
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng)
input_mask = None
if self.use_input_mask:
@@ -532,6 +547,18 @@ class GroupViTModelTest(ModelTesterMixin, unittest.TestCase):
def test_model_common_attributes(self):
pass
+ @is_pt_tf_cross_test
+ def test_pt_tf_model_equivalence(self):
+ import tensorflow as tf
+
+ seed = 163
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ tf.random.set_seed(seed)
+ return super().test_pt_tf_model_equivalence()
+
# override as the `logit_scale` parameter initilization is different for GROUPVIT
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py
new file mode 100644
index 000000000..8c4053a2c
--- /dev/null
+++ b/tests/models/groupvit/test_modeling_tf_groupvit.py
@@ -0,0 +1,715 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the TensorFlow GroupViT model. """
+
+
+import inspect
+import os
+import random
+import tempfile
+import unittest
+from importlib import import_module
+
+import numpy as np
+
+import requests
+from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
+from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_vision, slow
+from transformers.utils import is_tf_available, is_vision_available
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from transformers import TFGroupViTModel, TFGroupViTTextModel, TFGroupViTVisionModel, TFSharedEmbeddings
+ from transformers.models.groupvit.modeling_tf_groupvit import TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from transformers import CLIPProcessor
+
+
+class TFGroupViTVisionModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=12,
+ image_size=30,
+ patch_size=2,
+ num_channels=3,
+ is_training=True,
+ hidden_size=32,
+ depths=[6, 3, 3],
+ num_group_tokens=[64, 8, 0],
+ num_output_groups=[64, 8, 8],
+ num_attention_heads=4,
+ intermediate_size=37,
+ dropout=0.1,
+ attention_dropout=0.1,
+ initializer_range=0.02,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.is_training = is_training
+ self.hidden_size = hidden_size
+ self.depths = depths
+ self.num_hidden_layers = sum(depths)
+ self.expected_num_hidden_layers = len(depths) + 1
+ self.num_group_tokens = num_group_tokens
+ self.num_output_groups = num_output_groups
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.initializer_range = initializer_range
+ self.scope = scope
+
+ num_patches = (image_size // patch_size) ** 2
+ # no [CLS] token for GroupViT
+ self.seq_length = num_patches
+
+ def prepare_config_and_inputs(self):
+
+ rng = random.Random(0)
+ pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng)
+ config = self.get_config()
+
+ return config, pixel_values
+
+ def get_config(self):
+ return GroupViTVisionConfig(
+ image_size=self.image_size,
+ patch_size=self.patch_size,
+ num_channels=self.num_channels,
+ hidden_size=self.hidden_size,
+ depths=self.depths,
+ num_group_tokens=self.num_group_tokens,
+ num_output_groups=self.num_output_groups,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ dropout=self.dropout,
+ attention_dropout=self.attention_dropout,
+ initializer_range=self.initializer_range,
+ )
+
+ def create_and_check_model(self, config, pixel_values):
+ model = TFGroupViTVisionModel(config=config)
+ result = model(pixel_values, training=False)
+ self.parent.assertEqual(
+ result.last_hidden_state.shape, (self.batch_size, self.num_output_groups[-1], self.hidden_size)
+ )
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ config, pixel_values = config_and_inputs
+ inputs_dict = {"pixel_values": pixel_values}
+ return config, inputs_dict
+
+
+@require_tf
+class TFGroupViTVisionModelTest(TFModelTesterMixin, unittest.TestCase):
+ """
+    Here we also overwrite some of the tests of test_modeling_tf_common.py, as GroupViT does not use input_ids,
+    inputs_embeds, attention_mask and seq_length.
+ """
+
+ all_model_classes = (TFGroupViTVisionModel,) if is_tf_available() else ()
+
+ test_pruning = False
+ test_resize_embeddings = False
+ test_head_masking = False
+ test_onnx = False
+
+ def setUp(self):
+ self.model_tester = TFGroupViTVisionModelTester(self)
+ self.config_tester = ConfigTester(
+ self, config_class=GroupViTVisionConfig, has_text_modality=False, hidden_size=37
+ )
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ @unittest.skip(reason="GroupViT does not use inputs_embeds")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(reason="GroupViT does not use inputs_embeds")
+ def test_graph_mode_with_inputs_embeds(self):
+ pass
+
+ def test_model_common_attributes(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
+ x = model.get_output_embeddings()
+ self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))
+
+ def test_forward_signature(self):
+ config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ model = model_class(config)
+ signature = inspect.signature(model.call)
+ # signature.parameters is an OrderedDict => so arg_names order is deterministic
+ arg_names = [*signature.parameters.keys()]
+
+ expected_arg_names = ["pixel_values"]
+ self.assertListEqual(arg_names[:1], expected_arg_names)
+
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ def test_attention_outputs(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.return_dict = True
+
+ seq_len = getattr(self.model_tester, "seq_length", None)
+
+ expected_num_attention_outputs = sum(g > 0 for g in self.model_tester.num_group_tokens)
+
+ for model_class in self.all_model_classes:
+ inputs_dict["output_attentions"] = True
+ inputs_dict["output_hidden_states"] = False
+ config.return_dict = True
+ model = model_class(config)
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+ attentions = outputs.attentions
+ # GroupViT returns attention grouping of each stage
+ self.assertEqual(len(attentions), sum(g > 0 for g in self.model_tester.num_group_tokens))
+
+ # check that output_attentions also work using config
+ del inputs_dict["output_attentions"]
+ config.output_attentions = True
+ model = model_class(config)
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+ attentions = outputs.attentions
+ # GroupViT returns attention grouping of each stage
+ self.assertEqual(len(attentions), expected_num_attention_outputs)
+
+ out_len = len(outputs)
+
+ # Check attention is always last and order is fine
+ inputs_dict["output_attentions"] = True
+ inputs_dict["output_hidden_states"] = True
+ model = model_class(config)
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+
+ added_hidden_states = 1
+ self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+ self_attentions = outputs.attentions
+
+ # GroupViT returns attention grouping of each stage
+ self.assertEqual(len(self_attentions), expected_num_attention_outputs)
+ for i, self_attn in enumerate(self_attentions):
+ if self_attn is None:
+ continue
+
+ self.assertListEqual(
+ list(self_attentions[i].shape[-2:]),
+ [
+ self.model_tester.num_output_groups[i],
+ self.model_tester.num_output_groups[i - 1] if i > 0 else seq_len,
+ ],
+ )
+
+ def test_hidden_states_output(self):
+ def check_hidden_states_output(inputs_dict, config, model_class):
+ model = model_class(config)
+
+ outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
+
+ hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+ expected_num_layers = getattr(
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+ )
+ self.assertEqual(len(hidden_states), expected_num_layers)
+
+ seq_length = getattr(self.model_tester, "seq_length", None)
+
+ self.assertListEqual(
+ list(hidden_states[0].shape[-2:]),
+ [seq_length, self.model_tester.hidden_size],
+ )
+
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ inputs_dict["output_hidden_states"] = True
+ check_hidden_states_output(inputs_dict, config, model_class)
+
+ # check that output_hidden_states also work using config
+ del inputs_dict["output_hidden_states"]
+ config.output_hidden_states = True
+
+ check_hidden_states_output(inputs_dict, config, model_class)
+
+ @is_pt_tf_cross_test
+ def test_pt_tf_model_equivalence(self):
+ # `GroupViT` computes some indices using argmax, uses them as
+ # one-hot encoding for further computation. The problem is
+ # while PT/TF have very small difference in `y_soft` (~ 1e-9),
+ # the argmax could be totally different, if there are at least
+ # 2 indices with almost identical values. This leads to very
+ # large difference in the outputs. We need specific seeds to
+ # avoid almost identical values happening in `y_soft`.
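+        # Hypothetical illustration: if PT produces y_soft = [0.50000001, 0.49999999] for a patch while TF
+        # produces [0.49999999, 0.50000001], argmax assigns the patch to different groups, so the one-hot
+        # grouping (and therefore the final outputs) diverge far beyond the usual tolerance.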
+ import torch
+
+ seed = 338
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ tf.random.set_seed(seed)
+ return super().test_pt_tf_model_equivalence()
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = TFGroupViTVisionModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+ @slow
+ def test_saved_model_creation_extended(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.output_hidden_states = True
+ config.output_attentions = True
+
+ if hasattr(config, "use_cache"):
+ config.use_cache = True
+
+ seq_len = getattr(self.model_tester, "seq_length", None)
+
+ for model_class in self.all_model_classes:
+ class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+ model = model_class(config)
+ num_out = len(model(class_inputs_dict))
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname, saved_model=True)
+ saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
+ model = tf.keras.models.load_model(saved_model_dir)
+ outputs = model(class_inputs_dict)
+ output_hidden_states = outputs["hidden_states"]
+ output_attentions = outputs["attentions"]
+
+ # Check num outputs
+ self.assertEqual(len(outputs), num_out)
+
+ # Check num layers
+ expected_num_layers = getattr(
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+ )
+
+ self.assertEqual(len(output_hidden_states), expected_num_layers)
+ self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
+
+ # Check attention outputs
+ image_size = (self.model_tester.image_size, self.model_tester.image_size)
+ patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ seq_len = num_patches + 1
+
+ self.assertListEqual(
+ list(output_attentions[0].shape[-3:]),
+ [self.model_tester.num_attention_heads, seq_len, seq_len],
+ )
+
+ # Check hidden states
+ self.assertListEqual(
+ list(output_hidden_states[0].shape[-2:]),
+ [seq_len, self.model_tester.hidden_size],
+ )
+
+
+class TFGroupViTTextModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=12,
+ seq_length=7,
+ is_training=True,
+ use_input_mask=True,
+ use_labels=True,
+ vocab_size=99,
+ hidden_size=32,
+ num_hidden_layers=5,
+ num_attention_heads=4,
+ intermediate_size=37,
+ dropout=0.1,
+ attention_dropout=0.1,
+ max_position_embeddings=512,
+ initializer_range=0.02,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.seq_length = seq_length
+ self.is_training = is_training
+ self.use_input_mask = use_input_mask
+ self.use_labels = use_labels
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.max_position_embeddings = max_position_embeddings
+ self.initializer_range = initializer_range
+ self.scope = scope
+
+ def prepare_config_and_inputs(self):
+ rng = random.Random(0)
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, rng=rng)
+
+ input_mask = None
+ if self.use_input_mask:
+ input_mask = random_attention_mask([self.batch_size, self.seq_length])
+ # make sure the first token has attention mask `1` to ensure that, after combining the causal mask, there
+ # is still at least one token being attended to for each batch.
+ # TODO: Change `random_attention_mask` in PT/TF/Flax common test file, after a discussion with the team.
+ input_mask = tf.concat(
+ [tf.ones_like(input_mask[:, :1], dtype=input_mask.dtype), input_mask[:, 1:]], axis=-1
+ )
+
+ config = self.get_config()
+
+ return config, input_ids, input_mask
+
+ def get_config(self):
+ return GroupViTTextConfig(
+ vocab_size=self.vocab_size,
+ hidden_size=self.hidden_size,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ dropout=self.dropout,
+ attention_dropout=self.attention_dropout,
+ max_position_embeddings=self.max_position_embeddings,
+ initializer_range=self.initializer_range,
+ )
+
+ def create_and_check_model(self, config, input_ids, input_mask):
+ model = TFGroupViTTextModel(config=config)
+ result = model(input_ids, attention_mask=input_mask, training=False)
+ result = model(input_ids, training=False)
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ config, input_ids, input_mask = config_and_inputs
+ inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+ return config, inputs_dict
+
+
+@require_tf
+class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase):
+
+ all_model_classes = (TFGroupViTTextModel,) if is_tf_available() else ()
+ test_pruning = False
+ test_head_masking = False
+ test_onnx = False
+
+ def setUp(self):
+ self.model_tester = TFGroupViTTextModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=GroupViTTextConfig, hidden_size=37)
+
+ def test_config(self):
+ self.config_tester.run_common_tests()
+
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ @unittest.skip(reason="GroupViTTextModel does not use inputs_embeds")
+ def test_inputs_embeds(self):
+ pass
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = TFGroupViTTextModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+ @slow
+ def test_saved_model_creation_extended(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+ config.output_hidden_states = True
+ config.output_attentions = True
+
+ if hasattr(config, "use_cache"):
+ config.use_cache = True
+
+ for model_class in self.all_model_classes:
+ class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+ model = model_class(config)
+ num_out = len(model(class_inputs_dict))
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_pretrained(tmpdirname, saved_model=True)
+ saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
+ model = tf.keras.models.load_model(saved_model_dir)
+ outputs = model(class_inputs_dict)
+ output_hidden_states = outputs["hidden_states"]
+ output_attentions = outputs["attentions"]
+
+ # Check number of outputs
+ self.assertEqual(len(outputs), num_out)
+
+ # Check number of layers
+ expected_num_layers = getattr(
+ self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
+ )
+
+ # Check hidden states
+ self.assertEqual(len(output_hidden_states), expected_num_layers)
+ self.assertListEqual(
+ list(output_hidden_states[0].shape[-2:]),
+ [self.model_tester.seq_length, self.model_tester.hidden_size],
+ )
+
+ # Check attention outputs
+ self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
+
+ seq_length = self.model_tester.seq_length
+ key_length = getattr(self.model_tester, "key_length", seq_length)
+
+ self.assertListEqual(
+ list(output_attentions[0].shape[-3:]),
+ [self.model_tester.num_attention_heads, seq_length, key_length],
+ )
+
+
+class TFGroupViTModelTester:
+ def __init__(self, parent, is_training=True):
+ self.parent = parent
+ self.text_model_tester = TFGroupViTTextModelTester(parent)
+ self.vision_model_tester = TFGroupViTVisionModelTester(parent)
+ self.is_training = is_training
+
+ def prepare_config_and_inputs(self):
+ text_config, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs()
+ vision_config, pixel_values = self.vision_model_tester.prepare_config_and_inputs()
+
+ config = self.get_config()
+
+ return config, input_ids, attention_mask, pixel_values
+
+ def get_config(self):
+ return GroupViTConfig.from_text_vision_configs(
+ self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64
+ )
+
+ def create_and_check_model(self, config, input_ids, attention_mask, pixel_values):
+ model = TFGroupViTModel(config)
+ result = model(input_ids, pixel_values, attention_mask, training=False)
+ self.parent.assertEqual(
+ result.logits_per_image.shape, (self.vision_model_tester.batch_size, self.text_model_tester.batch_size)
+ )
+ self.parent.assertEqual(
+ result.logits_per_text.shape, (self.text_model_tester.batch_size, self.vision_model_tester.batch_size)
+ )
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ config, input_ids, attention_mask, pixel_values = config_and_inputs
+ inputs_dict = {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "pixel_values": pixel_values,
+ "return_loss": True,
+ }
+ return config, inputs_dict
+
+
+@require_tf
+class TFGroupViTModelTest(TFModelTesterMixin, unittest.TestCase):
+ all_model_classes = (TFGroupViTModel,) if is_tf_available() else ()
+ test_head_masking = False
+ test_pruning = False
+ test_resize_embeddings = False
+ test_attention_outputs = False
+ test_onnx = False
+
+ def setUp(self):
+ self.model_tester = TFGroupViTModelTester(self)
+
+ def test_model(self):
+ config_and_inputs = self.model_tester.prepare_config_and_inputs()
+ self.model_tester.create_and_check_model(*config_and_inputs)
+
+ @unittest.skip(reason="hidden_states are tested in individual model tests")
+ def test_hidden_states_output(self):
+ pass
+
+ @unittest.skip(reason="input_embeds are tested in individual model tests")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip(reason="CLIPModel does not have input/output embeddings")
+ def test_model_common_attributes(self):
+ pass
+
+ @is_pt_tf_cross_test
+ def test_pt_tf_model_equivalence(self):
+ # `GroupViT` computes some indices using argmax, uses them as
+ # one-hot encoding for further computation. The problem is
+ # while PT/TF have very small difference in `y_soft` (~ 1e-9),
+ # the argmax could be totally different, if there are at least
+ # 2 indices with almost identical values. This leads to very
+ # large difference in the outputs. We need specific seeds to
+ # avoid almost identical values happening in `y_soft`.
+ import torch
+
+ seed = 158
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ tf.random.set_seed(seed)
+ return super().test_pt_tf_model_equivalence()
+
+    # overwrite from common since `TFGroupViTModelTester` sets `return_loss` to `True`, which causes the preparation
+    # of `symbolic_inputs` to fail.
+ def test_keras_save_load(self):
+ config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+ # remove `return_loss` to make code work
+ if self.__class__.__name__ == "TFGroupViTModelTest":
+ inputs_dict.pop("return_loss", None)
+
+ tf_main_layer_classes = set(
+ module_member
+ for model_class in self.all_model_classes
+ for module in (import_module(model_class.__module__),)
+ for module_member_name in dir(module)
+ if module_member_name.endswith("MainLayer")
+            # This condition is required, since `modeling_tf_groupvit.py` has 3 classes whose names end with `MainLayer`.
+ and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
+ for module_member in (getattr(module, module_member_name),)
+ if isinstance(module_member, type)
+ and tf.keras.layers.Layer in module_member.__bases__
+ and getattr(module_member, "_keras_serializable", False)
+ )
+ for main_layer_class in tf_main_layer_classes:
+ # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
+ if "T5" in main_layer_class.__name__:
+                # Take the same values as in TFT5ModelTester for this shared layer
+ shared = TFSharedEmbeddings(99, 32, name="shared")
+ config.use_cache = inputs_dict.pop("use_cache", None)
+ main_layer = main_layer_class(config, embed_tokens=shared)
+ else:
+ main_layer = main_layer_class(config)
+
+ symbolic_inputs = {
+ name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
+ }
+
+ model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
+ outputs = model(inputs_dict)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ filepath = os.path.join(tmpdirname, "keras_model.h5")
+ model.save(filepath)
+ if "T5" in main_layer_class.__name__:
+ model = tf.keras.models.load_model(
+ filepath,
+ custom_objects={
+ main_layer_class.__name__: main_layer_class,
+ "TFSharedEmbeddings": TFSharedEmbeddings,
+ },
+ )
+ else:
+ model = tf.keras.models.load_model(
+ filepath, custom_objects={main_layer_class.__name__: main_layer_class}
+ )
+ assert isinstance(model, tf.keras.Model)
+ after_outputs = model(inputs_dict)
+ self.assert_outputs_same(after_outputs, outputs)
+
+ @slow
+ def test_model_from_pretrained(self):
+ for model_name in TF_GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+ model = TFGroupViTModel.from_pretrained(model_name)
+ self.assertIsNotNone(model)
+
+ @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
+ @slow
+ def test_saved_model_creation(self):
+ pass
+
+ @unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
+ @slow
+ def test_saved_model_creation_extended(self):
+ pass
+
+ @unittest.skip(reason="`saved_model` doesn't work with nested outputs so no preparation happens.")
+ @slow
+ def test_prepare_serving_output(self):
+ pass
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ im = Image.open(requests.get(url, stream=True).raw)
+ return im
+
+
+@require_vision
+@require_tf
+class TFGroupViTModelIntegrationTest(unittest.TestCase):
+ @slow
+ def test_inference(self):
+ model_name = "nvidia/groupvit-gcc-yfcc"
+ model = TFGroupViTModel.from_pretrained(model_name)
+ processor = CLIPProcessor.from_pretrained(model_name)
+
+ image = prepare_img()
+ inputs = processor(
+ text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="tf"
+ )
+
+ outputs = model(**inputs, training=False)
+
+ # verify the logits
+ self.assertEqual(
+ outputs.logits_per_image.shape,
+ tf.TensorShape((inputs.pixel_values.shape[0], inputs.input_ids.shape[0])),
+ )
+ self.assertEqual(
+ outputs.logits_per_text.shape,
+ tf.TensorShape((inputs.input_ids.shape[0], inputs.pixel_values.shape[0])),
+ )
+
+ expected_logits = tf.constant([[13.3523, 6.3629]])
+
+ tf.debugging.assert_near(outputs.logits_per_image, expected_logits, atol=1e-3)
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 9977578b5..b93d8f17e 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -757,7 +757,7 @@ class TFModelTesterMixin:
name="pixel_values",
dtype="float32",
)
- elif model_class.__name__ in ["TFCLIPModel"]:
+ elif model_class.__name__ in ["TFCLIPModel", "TFGroupViTModel"]:
inputs = {
"input_ids": tf.keras.Input(batch_shape=(3, max_input), name="input_ids", dtype="int32"),
"pixel_values": tf.keras.Input(
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 988967e79..5a6e4bd24 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -163,6 +163,8 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"GroupViTVisionModel",
"TFCLIPTextModel",
"TFCLIPVisionModel",
+ "TFGroupViTTextModel",
+ "TFGroupViTVisionModel",
"FlaxCLIPTextModel",
"FlaxCLIPVisionModel",
"FlaxWav2Vec2ForCTC",
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 48fc71d6f..eb1570d6c 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -39,6 +39,8 @@ src/transformers/models/electra/modeling_tf_electra.py
src/transformers/models/glpn/modeling_glpn.py
src/transformers/models/gpt2/modeling_gpt2.py
src/transformers/models/gptj/modeling_gptj.py
+src/transformers/models/groupvit/modeling_groupvit.py
+src/transformers/models/groupvit/modeling_tf_groupvit.py
src/transformers/models/hubert/modeling_hubert.py
src/transformers/models/layoutlm/modeling_layoutlm.py
src/transformers/models/layoutlm/modeling_tf_layoutlm.py