diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py
index a7d31775c..a60be9422 100644
--- a/src/transformers/models/align/modeling_align.py
+++ b/src/transformers/models/align/modeling_align.py
@@ -1444,7 +1444,7 @@ class AlignModel(AlignPreTrainedModel):
         self.vision_model = AlignVisionModel(vision_config)
 
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim)
-        self.temperature = nn.Parameter(torch.ones([]) * self.config.temperature_init_value)
+        self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py
index 90188c044..820c0ad02 100755
--- a/src/transformers/models/altclip/modeling_altclip.py
+++ b/src/transformers/models/altclip/modeling_altclip.py
@@ -1506,7 +1506,7 @@ class AltCLIPModel(AltCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
index 115aa14e8..cb6a78639 100644
--- a/src/transformers/models/blip/modeling_blip.py
+++ b/src/transformers/models/blip/modeling_blip.py
@@ -743,7 +743,7 @@ class BlipModel(BlipPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py
index 42b31c964..080c13f10 100644
--- a/src/transformers/models/bridgetower/modeling_bridgetower.py
+++ b/src/transformers/models/bridgetower/modeling_bridgetower.py
@@ -1778,7 +1778,7 @@ class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
         self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
         self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size)
 
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
index 86da1c7b6..32159c5ab 100644
--- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py
@@ -1376,7 +1376,7 @@ class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index c5533f3da..d0d99c1a1 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -1956,8 +1956,8 @@ class ClapModel(ClapPreTrainedModel):
         text_config = config.text_config
         audio_config = config.audio_config
 
-        self.logit_scale_a = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
-        self.logit_scale_t = nn.Parameter(torch.ones([]) * np.log(config.logit_scale_init_value))
+        self.logit_scale_a = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
+        self.logit_scale_t = nn.Parameter(torch.tensor(np.log(config.logit_scale_init_value)))
 
         self.projection_dim = config.projection_dim
 
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 487f756d3..292f932d9 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -977,7 +977,7 @@ class CLIPModel(CLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py
index b1d120e36..f15553763 100644
--- a/src/transformers/models/clipseg/modeling_clipseg.py
+++ b/src/transformers/models/clipseg/modeling_clipseg.py
@@ -979,7 +979,7 @@ class CLIPSegModel(CLIPSegPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py
index d986a17b7..9e106e3c2 100644
--- a/src/transformers/models/flava/modeling_flava.py
+++ b/src/transformers/models/flava/modeling_flava.py
@@ -1229,7 +1229,7 @@ class FlavaModel(FlavaPreTrainedModel):
 
         self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
         self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
         self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index 9c312c0ff..41fe81271 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -1368,7 +1368,7 @@ class GroupViTModel(GroupViTPreTrainedModel):
             nn.ReLU(inplace=True),
             nn.Linear(self.projection_intermediate_dim, self.projection_dim, bias=True),
         )
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py
index a874611ac..06d647095 100644
--- a/src/transformers/models/oneformer/modeling_oneformer.py
+++ b/src/transformers/models/oneformer/modeling_oneformer.py
@@ -399,7 +399,7 @@ class OneFormerLoss(nn.Module):
         self.importance_sample_ratio = importance_sample_ratio
         self.contrastive_temperature = contrastive_temperature
         if self.contrastive_temperature is not None:
-            self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / contrastive_temperature))
+            self.logit_scale = nn.Parameter(torch.tensor(np.log(1 / contrastive_temperature)))
 
     def _max_by_axis(self, the_list: List[List[int]]) -> List[int]:
         maxes = the_list[0]
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 9984cc516..a5880962b 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -1065,7 +1065,7 @@ class OwlViTModel(OwlViTPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
 
         # Initialize weights and apply final processing
         self.post_init()
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 9a65ec02e..106ff462e 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -204,7 +204,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
     @add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING)
     def get_text_features(
diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py
index bcf91b0b5..248a9248a 100644
--- a/src/transformers/models/x_clip/modeling_x_clip.py
+++ b/src/transformers/models/x_clip/modeling_x_clip.py
@@ -1309,7 +1309,7 @@ class XCLIPModel(XCLIPPreTrainedModel):
 
         self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
         self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
-        self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
 
         self.prompts_visual_layernorm = nn.LayerNorm(self.vision_embed_dim, eps=config.vision_config.layer_norm_eps)
         self.prompts_visual_projection = nn.Parameter(torch.randn(self.vision_embed_dim, self.projection_dim))
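
Note (not part of the patch): a minimal standalone sketch of why `torch.tensor(value)` is a drop-in replacement for `torch.ones([]) * value` in these scalar-parameter initializations, assuming the init value is a plain Python float as in the configs touched above. The value 2.6592 is only illustrative, not taken from any specific config.

    import torch
    from torch import nn

    logit_scale_init_value = 2.6592  # illustrative example value

    # Old pattern: build a 0-dim ones tensor, then scale it by the init value.
    old = nn.Parameter(torch.ones([]) * logit_scale_init_value)
    # New pattern: construct the 0-dim float32 tensor directly from the value.
    new = nn.Parameter(torch.tensor(logit_scale_init_value))

    assert old.shape == new.shape == torch.Size([])   # both are scalar (0-dim) parameters
    assert torch.equal(old.detach(), new.detach())    # same stored value, same float32 dtype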