diff --git a/test/inductor/test_custom_lowering.py b/test/inductor/test_custom_lowering.py index 17eb27ef4ec..4786a97429e 100644 --- a/test/inductor/test_custom_lowering.py +++ b/test/inductor/test_custom_lowering.py @@ -1,6 +1,7 @@ # Owner(s): ["module: inductor"] from functools import partial +from unittest import skipIf import torch from torch._inductor.ir import Pointwise @@ -140,6 +141,7 @@ class TestCustomLowering(InductorTestCase): )(add_custom_lowering) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_jagged_to_padded_dense_sanity_cuda(self): def fn(inp, offsets, max_seq_len): return torch.ops.test_inductor_ops.jagged_to_padded_dense( @@ -165,6 +167,7 @@ class TestCustomLowering(InductorTestCase): ) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_jagged_to_padded_dense_zero_size(self): # Previously, the masking was being completely stripped for the # masked load of the input value. That would lead to an IMA @@ -188,6 +191,7 @@ class TestCustomLowering(InductorTestCase): @requires_gpu() @skipIfRocm @skipIfXpu + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_tanh_approx(self): def fn(inp): return torch.ops.test_inductor_ops.tanh_approx(inp) @@ -202,6 +206,7 @@ class TestCustomLowering(InductorTestCase): @requires_gpu() @skipIfRocm @skipIfXpu + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_multi_inp_asm(self): def fn(a, b): return torch.ops.test_inductor_ops.add_custom(a, b) diff --git a/test/inductor/test_group_batch_fusion.py b/test/inductor/test_group_batch_fusion.py index 84b8aa9dcc7..4ce5605aaca 100644 --- a/test/inductor/test_group_batch_fusion.py +++ b/test/inductor/test_group_batch_fusion.py @@ -394,6 +394,7 @@ class TestGroupBatchFusion(TestCase): ) counters.clear() + @unittest.skipIf(GPU_TYPE == "mps", "welford_reduce is not yet implemented for MPS") def test_batch_layer_norm_fusion(self): for has_weight in [True, False]: for has_bias in [True, False]: diff 
--git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 94ec64b1356..8cb033c9b72 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -806,6 +806,16 @@ def skip_if_triton(fn): return wrapper +def skip_if_not_triton(fn): + @functools.wraps(fn) + def wrapper(self): + if not is_triton_backend(self.device): + raise unittest.SkipTest(f"triton backend is required for {self.device}") + return fn(self) + + return wrapper + + def skip_if_dynamic(fn): @functools.wraps(fn) def wrapper(self): @@ -827,8 +837,11 @@ def is_mps_backend(device): def is_triton_backend(device): - if getattr(device, "type", device) == "cpu": + device_type = getattr(device, "type", device) + if device_type == "cpu": return config.cpu_backend == "triton" + if device_type == "mps": + return False return config.cuda_backend == "triton" @@ -1888,7 +1901,7 @@ class CommonTemplate: self.common(fn, (torch.full((4,), float("-inf")),)) - @requires_gpu() + @skip_if_not_triton def test_reduction_config_limit(self): """ This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in @@ -9634,7 +9647,12 @@ class CommonTemplate: @requires_gpu() @skip_if_halide # cascading accuracy issues due rsqrt fallback def test_tmp_not_defined_issue3(self): - from torch import device + test_device = torch.device(type=self.device) + test_device_0 = ( + torch.device(type=self.device, index=0) + if self.device != "cpu" + else test_device + ) def forward( self, @@ -9678,7 +9696,7 @@ class CommonTemplate: 1, dtype=torch.int32, layout=torch.strided, - device=device(type=GPU_TYPE, index=0), + device=test_device_0, pin_memory=False, ) @@ -9687,7 +9705,7 @@ class CommonTemplate: start=0, step=1, dtype=torch.int32, - device=device(type=GPU_TYPE), + device=test_device, requires_grad=False, ) @@ -9697,7 +9715,7 @@ class CommonTemplate: start=0, step=1001, dtype=torch.int32, - device=device(type=GPU_TYPE, index=0), + device=test_device_0, requires_grad=False, 
) view: "i32[6150144]" = torch.ops.aten.reshape.default(mul, [-1]) @@ -9744,7 +9762,7 @@ class CommonTemplate: permute_1, ] - kwargs = aot_graph_input_parser(forward, device=GPU_TYPE) + kwargs = aot_graph_input_parser(forward, device=self.device) self.common(forward, [], kwargs=kwargs) @skip_if_gpu_halide @@ -10057,7 +10075,7 @@ class CommonTemplate: @tf32_on_and_off(0.005) def test_inductor_layout_optimization_input_mutations(self): # channel dim must be > 64 for inductor to do layout optimization and use NHWC - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(self.device) def f(x): x.mul_(2) @@ -10065,7 +10083,7 @@ class CommonTemplate: return out f_compiled = torch.compile(f) - x_ref = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + x_ref = torch.rand(2, 3, 128, 128, device=self.device) x_test = x_ref.detach().clone() with torch.no_grad(): out_ref = f(x_ref) @@ -11403,8 +11421,8 @@ class CommonTemplate: def test_custom_op_fixed_layout_sequential(self): import torch.library - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) - inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device) + inp = torch.rand(2, 3, 128, 128, device=self.device) expected_stride = mod(inp).stride() def bar(x): @@ -11440,8 +11458,8 @@ class CommonTemplate: @tf32_on_and_off(0.005) def test_mutable_custom_op_fixed_layout2(self): with torch.library._scoped_library("mylib", "DEF") as lib: - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) - inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device) + inp = torch.rand(2, 3, 128, 128, device=self.device) expected_stride = mod(inp).clone().stride() lib.define( @@ -11841,7 +11859,7 @@ class CommonTemplate: for cpu_dtype in test_dtypes: if not self.is_dtype_supported(cpu_dtype): continue - x = torch.rand([20], 
device=GPU_TYPE) + x = torch.rand([20], device=self.device) y = torch.rand([4], device="cpu", dtype=cpu_dtype) self.common( fn, diff --git a/test/inductor/test_torchinductor_codegen_config_overrides.py b/test/inductor/test_torchinductor_codegen_config_overrides.py index 716f3c63da6..2032aa25428 100644 --- a/test/inductor/test_torchinductor_codegen_config_overrides.py +++ b/test/inductor/test_torchinductor_codegen_config_overrides.py @@ -1,6 +1,7 @@ # Owner(s): ["module: inductor"] import importlib from typing import Any, Callable, Optional +from unittest import skipIf import torch import torch.utils._pytree as pytree @@ -83,6 +84,7 @@ class CodegenInductorTest(InductorTestCase): self.count_code("= reinterpret_tensor(", code, 2) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Triton is not available for MPS") def test_kernel_fusion_thresholds(self): def func(a, b): tmp0 = a + 1 diff --git a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py index 4a7042a8ab6..ed7e6b00d64 100644 --- a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py @@ -6,7 +6,11 @@ import sys import torch from torch._inductor.compile_fx import compile_fx from torch._inductor.test_case import TestCase -from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM +from torch.testing._internal.common_utils import ( + IS_LINUX, + TEST_WITH_ASAN, + TEST_WITH_ROCM, +) from torch.testing._internal.inductor_utils import ( _check_has_dynamic_shape, GPU_TYPE, @@ -110,7 +114,6 @@ test_failures = { "test_conv2d_channels_last_dynamic_shapes": TestFailure(("cpu",)), "test_conv3d_dynamic_shapes": TestFailure(("cpu",)), "test_conv3d_channels_last_dynamic_shapes": TestFailure(("cpu",)), - "test_mutable_custom_op_fixed_layout2_dynamic_shapes": TestFailure(("cpu",)), "test_expand_dynamic_shapes": TestFailure(("cpu",)), "test_full_boolean_dynamic_shapes": 
TestFailure(("cpu",)), "test_glu_dynamic_shapes": TestFailure(("cpu",)), @@ -258,7 +261,7 @@ test_failures = { "test_zero_element_mutation_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), "test_custom_op_3_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), "test_custom_op_fixed_layout_sequential_dynamic_shapes": TestFailure( - ("cpu", "cuda", "xpu") + ("cuda", "xpu") if IS_LINUX else ("cpu", "cuda", "xpu") ), "test_cat_uint8_dynamic_shapes": TestFailure( ("cpu",) diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py index a5c5684ed80..807bd3c9bed 100644 --- a/torch/testing/_internal/inductor_utils.py +++ b/torch/testing/_internal/inductor_utils.py @@ -52,6 +52,8 @@ HAS_CUDA = torch.cuda.is_available() and HAS_TRITON HAS_XPU = torch.xpu.is_available() and HAS_TRITON +HAS_MPS = torch.mps.is_available() + HAS_GPU = HAS_CUDA or HAS_XPU GPU_TYPE = get_gpu_type() @@ -110,7 +112,8 @@ def skip_windows_ci(name: str, file: str) -> None: sys.exit(0) raise unittest.SkipTest("requires sympy/functorch/filelock") -requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu") +# TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS +requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu") requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton") def requires_cuda_with_enough_memory(min_mem_required):