Make inductor_utils.requires_gpu accept MPS (#145156)

Not yet ready to set `HAS_GPU` to true, but this already lets us unskip tests that require a GPU.
(Noticed while running `test_mps_basics.py` that `test_scalar_cpu_tensor_arg` was getting skipped.)

- Replace `GPU_TYPE` with `self.device` in `test_custom_op_fixed_layout_sequential`, `test_inductor_layout_optimization_input_mutations`, and `test_mutable_custom_op_fixed_layout2`; otherwise the `_cpu`-suffixed variants of those GPU tests were still running on the GPU.
- Tweak `test_tmp_not_defined_issue3` to work correctly on CPU by defining `test_device` and `test_device_0`.
- Un-xfail `test_mutable_custom_op_fixed_layout2_dynamic_shapes`, as it should just work on CPU.
- Add a `skip_if_not_triton` decorator and apply it to `test_reduction_config_limit`, which needs neither CPU nor GPU, but rather a Triton backend (see the usage sketch after this list).
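
For context, here is a minimal, self-contained sketch of how the new skip logic is meant to be used. The two helpers mirror the versions in the diff below; `ExampleTest` and its hard-coded `device` attribute are illustrative only and not part of this PR.

```python
# Sketch only: the helpers mirror the diff below; ExampleTest is illustrative.
import functools
import unittest

from torch._inductor import config


def is_triton_backend(device):
    # CPU consults the configured cpu_backend, MPS never compiles through
    # Triton, and every other device falls back to the cuda_backend setting.
    device_type = getattr(device, "type", device)
    if device_type == "cpu":
        return config.cpu_backend == "triton"
    if device_type == "mps":
        return False
    return config.cuda_backend == "triton"


def skip_if_not_triton(fn):
    # Skip the test unless the backend compiling for self.device is Triton.
    @functools.wraps(fn)
    def wrapper(self):
        if not is_triton_backend(self.device):
            raise unittest.SkipTest(f"triton backend is required for {self.device}")
        return fn(self)

    return wrapper


class ExampleTest(unittest.TestCase):
    # The real suites fill this in per test-class suffix (_cpu, _cuda, _mps, ...);
    # "cpu" here is just for illustration.
    device = "cpu"

    @skip_if_not_triton
    def test_needs_triton(self):
        # Runs only when the active backend for `device` is Triton
        # (e.g. config.cpu_backend == "triton" for the CPU suffix).
        self.assertEqual(1 + 1, 2)


if __name__ == "__main__":
    unittest.main()
```

In the real test templates the decorator reads `self.device`, so the same test body is skipped for device suffixes whose backend is not Triton (including MPS) and runs everywhere else, independent of the `requires_gpu()` check that now also passes on MPS-only machines.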

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145156
Approved by: https://github.com/dcci, https://github.com/Skylion007, https://github.com/jansel
Nikita Shulga 2025-02-05 11:49:30 -08:00 committed by PyTorch MergeBot
parent 0dc03134d9
commit 6a985d8b2e
6 changed files with 50 additions and 18 deletions


@@ -1,6 +1,7 @@
# Owner(s): ["module: inductor"]
from functools import partial
from unittest import skipIf
import torch
from torch._inductor.ir import Pointwise
@@ -140,6 +141,7 @@ class TestCustomLowering(InductorTestCase):
)(add_custom_lowering)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_jagged_to_padded_dense_sanity_cuda(self):
def fn(inp, offsets, max_seq_len):
return torch.ops.test_inductor_ops.jagged_to_padded_dense(
@@ -165,6 +167,7 @@ class TestCustomLowering(InductorTestCase):
)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_jagged_to_padded_dense_zero_size(self):
# Previously, the masking was being completely stripped for the
# masked load of the input value. That would lead to an IMA
@@ -188,6 +191,7 @@ class TestCustomLowering(InductorTestCase):
@requires_gpu()
@skipIfRocm
@skipIfXpu
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_tanh_approx(self):
def fn(inp):
return torch.ops.test_inductor_ops.tanh_approx(inp)
@@ -202,6 +206,7 @@ class TestCustomLowering(InductorTestCase):
@requires_gpu()
@skipIfRocm
@skipIfXpu
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_multi_inp_asm(self):
def fn(a, b):
return torch.ops.test_inductor_ops.add_custom(a, b)


@@ -394,6 +394,7 @@ class TestGroupBatchFusion(TestCase):
)
counters.clear()
@unittest.skipIf(GPU_TYPE == "mps", "welford_reduce is not yet implemented for MPS")
def test_batch_layer_norm_fusion(self):
for has_weight in [True, False]:
for has_bias in [True, False]:


@@ -806,6 +806,16 @@ def skip_if_triton(fn):
return wrapper
def skip_if_not_triton(fn):
@functools.wraps(fn)
def wrapper(self):
if not is_triton_backend(self.device):
raise unittest.SkipTest(f"triton backend is required for {self.device}")
return fn(self)
return wrapper
def skip_if_dynamic(fn):
@functools.wraps(fn)
def wrapper(self):
@@ -827,8 +837,11 @@ def is_mps_backend(device):
def is_triton_backend(device):
if getattr(device, "type", device) == "cpu":
device_type = getattr(device, "type", device)
if device_type == "cpu":
return config.cpu_backend == "triton"
if device_type == "mps":
return False
return config.cuda_backend == "triton"
@@ -1888,7 +1901,7 @@ class CommonTemplate:
self.common(fn, (torch.full((4,), float("-inf")),))
@requires_gpu()
@skip_if_not_triton
def test_reduction_config_limit(self):
"""
This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in
@@ -9634,7 +9647,12 @@ class CommonTemplate:
@requires_gpu()
@skip_if_halide # cascading accuracy issues due to rsqrt fallback
def test_tmp_not_defined_issue3(self):
from torch import device
test_device = torch.device(type=self.device)
test_device_0 = (
torch.device(type=self.device, index=0)
if self.device != "cpu"
else test_device
)
def forward(
self,
@@ -9678,7 +9696,7 @@ class CommonTemplate:
1,
dtype=torch.int32,
layout=torch.strided,
device=device(type=GPU_TYPE, index=0),
device=test_device_0,
pin_memory=False,
)
@@ -9687,7 +9705,7 @@ class CommonTemplate:
start=0,
step=1,
dtype=torch.int32,
device=device(type=GPU_TYPE),
device=test_device,
requires_grad=False,
)
@@ -9697,7 +9715,7 @@ class CommonTemplate:
start=0,
step=1001,
dtype=torch.int32,
device=device(type=GPU_TYPE, index=0),
device=test_device_0,
requires_grad=False,
)
view: "i32[6150144]" = torch.ops.aten.reshape.default(mul, [-1])
@@ -9744,7 +9762,7 @@ class CommonTemplate:
permute_1,
]
kwargs = aot_graph_input_parser(forward, device=GPU_TYPE)
kwargs = aot_graph_input_parser(forward, device=self.device)
self.common(forward, [], kwargs=kwargs)
@skip_if_gpu_halide
@@ -10057,7 +10075,7 @@ class CommonTemplate:
@tf32_on_and_off(0.005)
def test_inductor_layout_optimization_input_mutations(self):
# channel dim must be > 64 for inductor to do layout optimization and use NHWC
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(self.device)
def f(x):
x.mul_(2)
@@ -10065,7 +10083,7 @@ class CommonTemplate:
return out
f_compiled = torch.compile(f)
x_ref = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
x_ref = torch.rand(2, 3, 128, 128, device=self.device)
x_test = x_ref.detach().clone()
with torch.no_grad():
out_ref = f(x_ref)
@@ -11403,8 +11421,8 @@ class CommonTemplate:
def test_custom_op_fixed_layout_sequential(self):
import torch.library
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
inp = torch.rand(2, 3, 128, 128, device=self.device)
expected_stride = mod(inp).stride()
def bar(x):
@@ -11440,8 +11458,8 @@ class CommonTemplate:
@tf32_on_and_off(0.005)
def test_mutable_custom_op_fixed_layout2(self):
with torch.library._scoped_library("mylib", "DEF") as lib:
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
inp = torch.rand(2, 3, 128, 128, device=self.device)
expected_stride = mod(inp).clone().stride()
lib.define(
@@ -11841,7 +11859,7 @@ class CommonTemplate:
for cpu_dtype in test_dtypes:
if not self.is_dtype_supported(cpu_dtype):
continue
x = torch.rand([20], device=GPU_TYPE)
x = torch.rand([20], device=self.device)
y = torch.rand([4], device="cpu", dtype=cpu_dtype)
self.common(
fn,


@@ -1,6 +1,7 @@
# Owner(s): ["module: inductor"]
import importlib
from typing import Any, Callable, Optional
from unittest import skipIf
import torch
import torch.utils._pytree as pytree
@@ -83,6 +84,7 @@ class CodegenInductorTest(InductorTestCase):
self.count_code("= reinterpret_tensor(", code, 2)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Triton is not available for MPS")
def test_kernel_fusion_thresholds(self):
def func(a, b):
tmp0 = a + 1


@@ -6,7 +6,11 @@ import sys
import torch
from torch._inductor.compile_fx import compile_fx
from torch._inductor.test_case import TestCase
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
from torch.testing._internal.common_utils import (
IS_LINUX,
TEST_WITH_ASAN,
TEST_WITH_ROCM,
)
from torch.testing._internal.inductor_utils import (
_check_has_dynamic_shape,
GPU_TYPE,
@@ -110,7 +114,6 @@ test_failures = {
"test_conv2d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
"test_conv3d_dynamic_shapes": TestFailure(("cpu",)),
"test_conv3d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
"test_mutable_custom_op_fixed_layout2_dynamic_shapes": TestFailure(("cpu",)),
"test_expand_dynamic_shapes": TestFailure(("cpu",)),
"test_full_boolean_dynamic_shapes": TestFailure(("cpu",)),
"test_glu_dynamic_shapes": TestFailure(("cpu",)),
@@ -258,7 +261,7 @@ test_failures = {
"test_zero_element_mutation_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
"test_custom_op_3_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
"test_custom_op_fixed_layout_sequential_dynamic_shapes": TestFailure(
("cpu", "cuda", "xpu")
("cuda", "xpu") if IS_LINUX else ("cpu", "cuda", "xpu")
),
"test_cat_uint8_dynamic_shapes": TestFailure(
("cpu",)


@@ -52,6 +52,8 @@ HAS_CUDA = torch.cuda.is_available() and HAS_TRITON
HAS_XPU = torch.xpu.is_available() and HAS_TRITON
HAS_MPS = torch.mps.is_available()
HAS_GPU = HAS_CUDA or HAS_XPU
GPU_TYPE = get_gpu_type()
@@ -110,7 +112,8 @@ def skip_windows_ci(name: str, file: str) -> None:
sys.exit(0)
raise unittest.SkipTest("requires sympy/functorch/filelock")
requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu")
# TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS
requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu")
requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton")
def requires_cuda_with_enough_memory(min_mem_required):