diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index 92a5bd43b..93f0e96ab 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -1040,7 +1040,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
 
         masked_lm_loss = None
         if labels is not None:
-            loss_fct = nn.CrossEntropyLoss()
+            loss_fct = CrossEntropyLoss()
             # TODO(SS): do we need to ignore pad tokens in labels?
             masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
 
@@ -1179,7 +1179,8 @@ class BartForSequenceClassification(PretrainedBartModel):
 
         loss = None
         if labels is not None:
-            loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1))
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
 
         if not return_dict:
             output = (logits,) + outputs[1:]
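
Note: both hunks call a bare CrossEntropyLoss, which presumes a module-level import such as `from torch.nn import CrossEntropyLoss` elsewhere in the file; that import is not shown in this fragment. On the TODO about pad tokens in labels, below is a minimal standalone sketch of one way it could be handled: torch's CrossEntropyLoss takes an ignore_index argument that excludes matching label positions from the loss. The names here (pad_token_id, the toy shapes, and the default pad id of 1) are illustrative assumptions, not part of this patch.

import torch
from torch.nn import CrossEntropyLoss

pad_token_id = 1  # assumed: BART's conventional padding token id
vocab_size = 8

lm_logits = torch.randn(2, 5, vocab_size)      # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (2, 5))  # toy label ids
labels[0, 3:] = pad_token_id                   # simulate right-padded positions

# ignore_index drops the padded positions from the loss average,
# so pad labels contribute neither to the loss nor to gradients.
loss_fct = CrossEntropyLoss(ignore_index=pad_token_id)
masked_lm_loss = loss_fct(lm_logits.view(-1, vocab_size), labels.view(-1))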