Make inductor_utils.requires_gpu accept MPS (#145156)

Not yet ready to set `HAS_GPU` to true, but this already lets us unskip tests that require a GPU.
(Noticed while running `test_mps_basics.py` that `test_scalar_cpu_tensor_arg` was getting skipped.)

- Replace `GPU_TYPE` with `self.device` in `test_custom_op_fixed_layout_sequential`, `test_inductor_layout_optimization_input_mutations`, and `test_mutable_custom_op_fixed_layout2`; otherwise the `_cpu`-suffixed variants of those GPU tests were still running on the GPU.
- Tweak `test_tmp_not_defined_issue3` to work correctly on CPU by defining `test_device` and `test_device_0`.
- Un-xfail `test_mutable_custom_op_fixed_layout2_dynamic_shapes`, as it should just work on CPU.
- Add a `skip_if_not_triton` decorator and apply it to `test_reduction_config_limit`, which needs neither CPU nor GPU, but rather a Triton backend (see the usage sketch after this list).
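
For context, here is a minimal, self-contained sketch of how the new skip logic is meant to be used. The two helpers mirror the versions in the diff below; `ExampleTest` and its hard-coded `device` attribute are illustrative only and not part of this PR.

```python
# Sketch only: the helpers mirror the diff below; ExampleTest is illustrative.
import functools
import unittest

from torch._inductor import config


def is_triton_backend(device):
    # CPU consults the configured cpu_backend, MPS never compiles through
    # Triton, and every other device falls back to the cuda_backend setting.
    device_type = getattr(device, "type", device)
    if device_type == "cpu":
        return config.cpu_backend == "triton"
    if device_type == "mps":
        return False
    return config.cuda_backend == "triton"


def skip_if_not_triton(fn):
    # Skip the test unless the backend compiling for self.device is Triton.
    @functools.wraps(fn)
    def wrapper(self):
        if not is_triton_backend(self.device):
            raise unittest.SkipTest(f"triton backend is required for {self.device}")
        return fn(self)

    return wrapper


class ExampleTest(unittest.TestCase):
    # The real suites fill this in per test-class suffix (_cpu, _cuda, _mps, ...);
    # "cpu" here is just for illustration.
    device = "cpu"

    @skip_if_not_triton
    def test_needs_triton(self):
        # Runs only when the active backend for `device` is Triton
        # (e.g. config.cpu_backend == "triton" for the CPU suffix).
        self.assertEqual(1 + 1, 2)


if __name__ == "__main__":
    unittest.main()
```

In the real test templates the decorator reads `self.device`, so the same test body is skipped for device suffixes whose backend is not Triton (including MPS) and runs everywhere else, independent of the `requires_gpu()` check that now also passes on MPS-only machines.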

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145156
Approved by: https://github.com/dcci, https://github.com/Skylion007, https://github.com/jansel
Nikita Shulga 2025-02-05 11:49:30 -08:00 committed by PyTorch MergeBot
parent 0dc03134d9
commit 6a985d8b2e
6 changed files with 50 additions and 18 deletions


@@ -1,6 +1,7 @@
# Owner(s): ["module: inductor"]
from functools import partial
from unittest import skipIf
import torch
from torch._inductor.ir import Pointwise
@@ -140,6 +141,7 @@ class TestCustomLowering(InductorTestCase):
)(add_custom_lowering)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_jagged_to_padded_dense_sanity_cuda(self):
def fn(inp, offsets, max_seq_len):
return torch.ops.test_inductor_ops.jagged_to_padded_dense(
@@ -165,6 +167,7 @@ class TestCustomLowering(InductorTestCase):
)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_jagged_to_padded_dense_zero_size(self):
# Previously, the masking was being completely stripped for the
# masked load of the input value. That would lead to an IMA
@@ -188,6 +191,7 @@ class TestCustomLowering(InductorTestCase):
@requires_gpu()
@skipIfRocm
@skipIfXpu
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_tanh_approx(self):
def fn(inp):
return torch.ops.test_inductor_ops.tanh_approx(inp)
@@ -202,6 +206,7 @@ class TestCustomLowering(InductorTestCase):
@requires_gpu()
@skipIfRocm
@skipIfXpu
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
def test_multi_inp_asm(self):
def fn(a, b):
return torch.ops.test_inductor_ops.add_custom(a, b)


@@ -394,6 +394,7 @@ class TestGroupBatchFusion(TestCase):
)
counters.clear()
@unittest.skipIf(GPU_TYPE == "mps", "welford_reduce is not yet implemented for MPS")
def test_batch_layer_norm_fusion(self):
for has_weight in [True, False]:
for has_bias in [True, False]:


@@ -806,6 +806,16 @@ def skip_if_triton(fn):
return wrapper
def skip_if_not_triton(fn):
@functools.wraps(fn)
def wrapper(self):
if not is_triton_backend(self.device):
raise unittest.SkipTest(f"triton backend is required for {self.device}")
return fn(self)
return wrapper
def skip_if_dynamic(fn):
@functools.wraps(fn)
def wrapper(self):
@@ -827,8 +837,11 @@ def is_mps_backend(device):
def is_triton_backend(device):
if getattr(device, "type", device) == "cpu":
device_type = getattr(device, "type", device)
if device_type == "cpu":
return config.cpu_backend == "triton"
if device_type == "mps":
return False
return config.cuda_backend == "triton"
@@ -1888,7 +1901,7 @@ class CommonTemplate:
self.common(fn, (torch.full((4,), float("-inf")),))
@requires_gpu()
@skip_if_not_triton
def test_reduction_config_limit(self):
"""
This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in
@@ -9634,7 +9647,12 @@ class CommonTemplate:
@requires_gpu()
@skip_if_halide # cascading accuracy issues due to rsqrt fallback
def test_tmp_not_defined_issue3(self):
from torch import device
test_device = torch.device(type=self.device)
test_device_0 = (
torch.device(type=self.device, index=0)
if self.device != "cpu"
else test_device
)
def forward(
self,
@@ -9678,7 +9696,7 @@ class CommonTemplate:
1,
dtype=torch.int32,
layout=torch.strided,
device=device(type=GPU_TYPE, index=0),
device=test_device_0,
pin_memory=False,
)
@@ -9687,7 +9705,7 @@ class CommonTemplate:
start=0,
step=1,
dtype=torch.int32,
device=device(type=GPU_TYPE),
device=test_device,
requires_grad=False,
)
@@ -9697,7 +9715,7 @@ class CommonTemplate:
start=0,
step=1001,
dtype=torch.int32,
device=device(type=GPU_TYPE, index=0),
device=test_device_0,
requires_grad=False,
)
view: "i32[6150144]" = torch.ops.aten.reshape.default(mul, [-1])
@@ -9744,7 +9762,7 @@ class CommonTemplate:
permute_1,
]
kwargs = aot_graph_input_parser(forward, device=GPU_TYPE)
kwargs = aot_graph_input_parser(forward, device=self.device)
self.common(forward, [], kwargs=kwargs)
@skip_if_gpu_halide
@@ -10057,7 +10075,7 @@ class CommonTemplate:
@tf32_on_and_off(0.005)
def test_inductor_layout_optimization_input_mutations(self):
# channel dim must be > 64 for inductor to do layout optimization and use NHWC
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(self.device)
def f(x):
x.mul_(2)
@@ -10065,7 +10083,7 @@ class CommonTemplate:
return out
f_compiled = torch.compile(f)
x_ref = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
x_ref = torch.rand(2, 3, 128, 128, device=self.device)
x_test = x_ref.detach().clone()
with torch.no_grad():
out_ref = f(x_ref)
@@ -11403,8 +11421,8 @@ class CommonTemplate:
def test_custom_op_fixed_layout_sequential(self):
import torch.library
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
inp = torch.rand(2, 3, 128, 128, device=self.device)
expected_stride = mod(inp).stride()
def bar(x):
@@ -11440,8 +11458,8 @@ class CommonTemplate:
@tf32_on_and_off(0.005)
def test_mutable_custom_op_fixed_layout2(self):
with torch.library._scoped_library("mylib", "DEF") as lib:
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
inp = torch.rand(2, 3, 128, 128, device=self.device)
expected_stride = mod(inp).clone().stride()
lib.define(
@@ -11841,7 +11859,7 @@ class CommonTemplate:
for cpu_dtype in test_dtypes:
if not self.is_dtype_supported(cpu_dtype):
continue
x = torch.rand([20], device=GPU_TYPE)
x = torch.rand([20], device=self.device)
y = torch.rand([4], device="cpu", dtype=cpu_dtype)
self.common(
fn,


@@ -1,6 +1,7 @@
# Owner(s): ["module: inductor"]
import importlib
from typing import Any, Callable, Optional
from unittest import skipIf
import torch
import torch.utils._pytree as pytree
@@ -83,6 +84,7 @@ class CodegenInductorTest(InductorTestCase):
self.count_code("= reinterpret_tensor(", code, 2)
@requires_gpu()
@skipIf(GPU_TYPE == "mps", "Triton is not available for MPS")
def test_kernel_fusion_thresholds(self):
def func(a, b):
tmp0 = a + 1


@@ -6,7 +6,11 @@ import sys
import torch
from torch._inductor.compile_fx import compile_fx
from torch._inductor.test_case import TestCase
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
from torch.testing._internal.common_utils import (
IS_LINUX,
TEST_WITH_ASAN,
TEST_WITH_ROCM,
)
from torch.testing._internal.inductor_utils import (
_check_has_dynamic_shape,
GPU_TYPE,
@@ -110,7 +114,6 @@ test_failures = {
"test_conv2d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
"test_conv3d_dynamic_shapes": TestFailure(("cpu",)),
"test_conv3d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
"test_mutable_custom_op_fixed_layout2_dynamic_shapes": TestFailure(("cpu",)),
"test_expand_dynamic_shapes": TestFailure(("cpu",)),
"test_full_boolean_dynamic_shapes": TestFailure(("cpu",)),
"test_glu_dynamic_shapes": TestFailure(("cpu",)),
@@ -258,7 +261,7 @@ test_failures = {
"test_zero_element_mutation_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
"test_custom_op_3_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
"test_custom_op_fixed_layout_sequential_dynamic_shapes": TestFailure(
("cpu", "cuda", "xpu")
("cuda", "xpu") if IS_LINUX else ("cpu", "cuda", "xpu")
),
"test_cat_uint8_dynamic_shapes": TestFailure(
("cpu",)


@@ -52,6 +52,8 @@ HAS_CUDA = torch.cuda.is_available() and HAS_TRITON
HAS_XPU = torch.xpu.is_available() and HAS_TRITON
HAS_MPS = torch.mps.is_available()
HAS_GPU = HAS_CUDA or HAS_XPU
GPU_TYPE = get_gpu_type()
@@ -110,7 +112,8 @@ def skip_windows_ci(name: str, file: str) -> None:
sys.exit(0)
raise unittest.SkipTest("requires sympy/functorch/filelock")
requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu")
# TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS
requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu")
requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton")
def requires_cuda_with_enough_memory(min_mem_required):