Mirror of https://github.com/saymrwulf/transformers.git, synced 2026-05-14 20:58:08 +00:00
[several models] improve readability (#24585)

* [modeling_clip.py] improve readability
* apply to other models
* fix
This commit is contained in:
parent 134caef31a
commit 49e812d12b

14 changed files with 15 additions and 15 deletions
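All 14 files apply the same one-line change: a 0-dimensional scalar parameter previously built as torch.ones([]) * init_value is now built directly with torch.tensor(init_value). A minimal sketch of the equivalence follows; the init value is illustrative only, not taken from any config in this diff.

import torch
from torch import nn

init_value = 2.6592  # hypothetical logit_scale_init_value, chosen for illustration only

# Old style: make a 0-dim tensor of ones, then scale it by the init value.
old = nn.Parameter(torch.ones([]) * init_value)

# New style: build the 0-dim tensor from the value directly.
new = nn.Parameter(torch.tensor(init_value))

assert old.shape == new.shape == torch.Size([])  # both are scalar parameters
assert torch.allclose(old, new)                  # same value and default dtype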
@@ -1444,7 +1444,7 @@ class AlignModel(AlignPreTrainedModel):
         self.vision_model = AlignVisionModel(vision_config)
 
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim)
-        self.temperature = nn.Parameter(torch.ones([]) * self.config.temperature_init_value)
+        self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1506,7 +1506,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -743,7 +743,7 @@ class BlipModel(BlipPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1778,7 +1778,7 @@ class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
         self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
         self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size)
 
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1376,7 +1376,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1956,8 +1956,8 @@ class ClapModel(ClapPreTrainedModel):
         text_config = config.text_config
         audio_config = config.audio_config
 
-        self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
-        self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
+        self.logit_scale_a = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
+        self.logit_scale_t = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
 
         self.projection_dim = config.projection_dim
 
@@ -977,7 +977,7 @@ class CLIPModel(CLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -979,7 +979,7 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1229,7 +1229,7 @@ class FlavaModel(FlavaPreTrainedModel):
 
         self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
         self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
         self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
@@ -1368,7 +1368,7 @@ class GroupViTModel(GroupViTPreTrainedModel):
             nn.ReLU(inplace=True),
             nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
         )
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -399,7 +399,7 @@ class OneFormerLoss(nn.Module):
         self.importance_sample_ratio = importance_sample_ratio
         self.contrastive_temperature = contrastive_temperature
         if self.contrastive_temperature is not None:
-            self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / contrastive_temperature))
+            self.logit_scale = nn.Parameter(torch.tensor(np.log(1 / contrastive_temperature)))
 
     def _max_by_axis(self, the_list: List[List[int]]) -> List[int]:
         maxes = the_list[0]
@@ -1065,7 +1065,7 @@ class OwlViTModel(OwlViTPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -204,7 +204,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
     @add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING)
     def get_text_features(
@@ -1309,7 +1309,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
         self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))