diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 08d9eddd9..c81fbbd01 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -727,7 +727,7 @@ class PhiForCausalLM(PhiPreTrainedModel, GenerationMixin): super().__init__(config) self.model = PhiModel(config) self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True) # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index 0faa4629f..d8480a7ad 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -284,7 +284,9 @@ class PhiModel(LlamaModel): class PhiForCausalLM(LlamaForCausalLM): - pass + def __init__(self, config): + super().__init__(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True) class PhiForSequenceClassification(LlamaForSequenceClassification):