Add XPU type for work-around -inf mask causing sdpa NaN issue in modeling files (#35647)
* add xpu for unmask
* change modular for generated matching
* add latest modeling for helium
This commit is contained in:
parent d8080d55c7
commit 315a9f494e
60 changed files with 61 additions and 61 deletions
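
Every hunk below makes the same one-line change: the SDPA unmask workaround that previously ran only for CUDA tensors now also runs for XPU (Intel GPU) tensors. For context, here is a minimal, self-contained sketch (mine, not part of the commit) of the NaN behavior being worked around, and of the finite-min "unmask fully masked rows" fix that the guarded branch applies:

import torch
import torch.nn.functional as F

q, k, v = (torch.randn(1, 1, 4, 8) for _ in range(3))

# An additive mask whose first query row is fully masked (as happens with left
# padding): softmax over a row that is entirely -inf yields NaN, so SDPA
# returns NaN for that row. Details: https://github.com/pytorch/pytorch/issues/110213
mask = torch.zeros(1, 1, 4, 4)
mask[..., 0, :] = float("-inf")
print(F.scaled_dot_product_attention(q, k, v, attn_mask=mask).isnan().any())  # True

# The workaround: build the mask from the finite dtype minimum instead of -inf,
# then multiply fully masked rows by zero so they attend to every token.
min_dtype = torch.finfo(torch.float32).min
mask = torch.zeros(1, 1, 4, 4)
mask[..., 0, :] = min_dtype
fully_masked = (mask == min_dtype).all(dim=-1, keepdim=True)
mask = mask.mul(~fully_masked)  # min_dtype * 0 == 0, unlike -inf * 0 == nan
print(F.scaled_dot_product_attention(q, k, v, attn_mask=mask).isnan().any())  # False
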
@@ -639,7 +639,7 @@ class DummyModel(DummyPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -639,7 +639,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -644,7 +644,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -561,7 +561,7 @@ class SuperModel(SuperPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1056,7 +1056,7 @@ class AriaTextModel(AriaTextPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1360,7 +1360,7 @@ class BambaModel(BambaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1101,7 +1101,7 @@ class BambaModel(BambaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -785,7 +785,7 @@ class BloomModel(BloomPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1431,7 +1431,7 @@ class ChameleonModel(ChameleonPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -628,7 +628,7 @@ class CodeGenModel(CodeGenPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -706,7 +706,7 @@ class CohereModel(CoherePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1159,7 +1159,7 @@ class DbrxModel(DbrxPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -945,7 +945,7 @@ class DiffLlamaModel(DiffLlamaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1524,7 +1524,7 @@ class Emu3TextModel(Emu3PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1088,7 +1088,7 @@ class FalconModel(FalconPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -677,7 +677,7 @@ class GemmaModel(GemmaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -687,7 +687,7 @@ class GlmModel(GlmPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -927,7 +927,7 @@ class GPTBigCodeModel(GPTBigCodePreTrainedModel):
                 # [batch_size, target_length, 1, source_length], not compatible with SDPA, hence this transpose.
                 self_attention_mask = self_attention_mask.transpose(1, 2)

-            if query_length > 1 and attention_mask is not None and attention_mask.device.type == "cuda":
+            if query_length > 1 and attention_mask is not None and attention_mask.device.type in ["cuda", "xpu"]:
                 # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
                 # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
                 self_attention_mask = AttentionMaskConverter._unmask_unattended(
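
The GPTBigCode hunk above guards a call to AttentionMaskConverter._unmask_unattended, the helper that performs the row unmasking. A rough usage sketch (mine, assuming the _unmask_unattended(expanded_mask, min_dtype) signature exposed by transformers.modeling_attn_mask_utils around this release):

import torch
from transformers.modeling_attn_mask_utils import AttentionMaskConverter

min_dtype = torch.finfo(torch.float32).min
# Row 0 fully masked, row 1 fully visible.
mask = torch.full((1, 1, 2, 2), min_dtype)
mask[..., 1, :] = 0.0

# Fully masked rows are multiplied by zero, i.e. reset to "attend to all
# tokens"; partially masked rows are left untouched.
print(AttentionMaskConverter._unmask_unattended(mask, min_dtype))
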
@@ -837,7 +837,7 @@ class GPTNeoModel(GPTNeoPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -681,7 +681,7 @@ class GPTNeoXModel(GPTNeoXPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -708,7 +708,7 @@ class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -936,7 +936,7 @@ class GPTJModel(GPTJPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -690,7 +690,7 @@ class GraniteModel(GranitePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1177,7 +1177,7 @@ class GraniteMoeModel(GraniteMoePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -674,7 +674,7 @@ class HeliumModel(HeliumPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1408,7 +1408,7 @@ class IdeficsModel(IdeficsPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1381,7 +1381,7 @@ class JambaModel(JambaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.

@@ -1170,7 +1170,7 @@ class JetMoeModel(JetMoePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -676,7 +676,7 @@ class LlamaModel(LlamaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1645,7 +1645,7 @@ class LongT5Stack(LongT5PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1133,7 +1133,7 @@ class MimiTransformerModel(nn.Module):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -667,7 +667,7 @@ class MistralModel(MistralPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -185,7 +185,7 @@ class MistralModel(LlamaModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -801,7 +801,7 @@ class MixtralModel(MixtralPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1123,7 +1123,7 @@ class MllamaPreTrainedModel(PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1040,7 +1040,7 @@ class MoonshineDecoder(MoonshinePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1363,7 +1363,7 @@ class MoshiDepthDecoder(MoshiPreTrainedModel, GenerationMixin):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1675,7 +1675,7 @@ class MoshiModel(MoshiPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1237,7 +1237,7 @@ class MT5Stack(MT5PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -924,7 +924,7 @@ class NemotronModel(NemotronPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -652,7 +652,7 @@ class OlmoModel(OlmoPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -653,7 +653,7 @@ class Olmo2Model(Olmo2PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1084,7 +1084,7 @@ class OlmoeModel(OlmoePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -725,7 +725,7 @@ class PersimmonModel(PersimmonPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -650,7 +650,7 @@ class PhiModel(PhiPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -737,7 +737,7 @@ class Phi3Model(Phi3PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1247,7 +1247,7 @@ class PhimoeModel(PhimoePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1632,7 +1632,7 @@ class Pix2StructTextModel(Pix2StructPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1045,7 +1045,7 @@ class Pop2PianoStack(Pop2PianoPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -661,7 +661,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1135,7 +1135,7 @@ class Qwen2MoeModel(Qwen2MoePreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1202,7 +1202,7 @@ class Qwen2VLModel(Qwen2VLPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -765,7 +765,7 @@ class RecurrentGemmaModel(RecurrentGemmaPreTrainedModel):
                 padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                 causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

-        if attention_mask is not None and attention_mask.device.type == "cuda":
+        if attention_mask is not None and attention_mask.device.type in ["cuda", "xpu"]:
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
             # Details: https://github.com/pytorch/pytorch/issues/110213
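
The comments throughout these hunks refer to fully masked rows arising from left padding. A small self-contained sketch (mine, not from the diff) of how that happens:

import torch

seq_len = 4
min_dtype = torch.finfo(torch.float32).min

# Left-padded sequence: first two positions are padding (0), last two are real (1).
attention_mask = torch.tensor([[0, 0, 1, 1]])

# Causal mask: position i may attend only to positions <= i.
causal = torch.full((seq_len, seq_len), min_dtype).triu(diagonal=1)

# Add the padding mask: padding key positions get min_dtype in every row.
full_mask = causal[None, None] + (1 - attention_mask[:, None, None, :]) * min_dtype
full_mask = full_mask.clamp(min=min_dtype)  # keep overflow from reaching -inf

print((full_mask[0, 0] == min_dtype).all(dim=-1))
# tensor([ True,  True, False, False]) -> the first two query rows are fully
# masked, exactly the rows the guarded branch resets before calling SDPA.
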
@@ -980,7 +980,7 @@ class StableLmModel(StableLmPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -663,7 +663,7 @@ class Starcoder2Model(Starcoder2PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1181,7 +1181,7 @@ class SwitchTransformersStack(SwitchTransformersPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1250,7 +1250,7 @@ class T5Stack(T5PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1583,7 +1583,7 @@ class UdopStack(UdopPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -894,7 +894,7 @@ class UMT5Stack(UMT5PreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1420,7 +1420,7 @@ class WhisperDecoder(WhisperPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
             and not output_attentions
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

@@ -1168,7 +1168,7 @@ class ZambaModel(ZambaPreTrainedModel):
         if (
             self.config._attn_implementation == "sdpa"
             and attention_mask is not None
-            and attention_mask.device.type == "cuda"
+            and attention_mask.device.type in ["cuda", "xpu"]
         ):
             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.