mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Make inductor_utils.requires_gpu accept MPS (#145156)
Not yet ready to set HAS_GPU to true, but can unskip tests that require GPU (noticed while running test_mps_basics.py that `test_scalar_cpu_tensor_arg` is getting skipped) - Replace `GPU_TYPE` with `self.device` in `test_custom_op_fixed_layout_sequential`, `test_inductor_layout_optimization_input_mutations`, `test_mutable_custom_op_fixed_layout2`, otherwise the GPU tests are just running for _cpu suffixes. - Tweak `test_tmp_not_defined_issue3` to work correctly on CPU, by defining `test_device` and `test_device_0` - UnXFail `test_mutable_custom_op_fixed_layout2_dynamic_shapes` as it should just work on CPU - Add `skip_if_not_triton` decorator and decorate `test_reduction_config_limit` with it, as it does not need CPU nor GPU, but rather a triton backend. Pull Request resolved: https://github.com/pytorch/pytorch/pull/145156 Approved by: https://github.com/dcci, https://github.com/Skylion007, https://github.com/jansel
This commit is contained in:
parent
0dc03134d9
commit
6a985d8b2e
6 changed files with 50 additions and 18 deletions
|
|
@ -1,6 +1,7 @@
|
|||
# Owner(s): ["module: inductor"]
|
||||
|
||||
from functools import partial
|
||||
from unittest import skipIf
|
||||
|
||||
import torch
|
||||
from torch._inductor.ir import Pointwise
|
||||
|
|
@ -140,6 +141,7 @@ class TestCustomLowering(InductorTestCase):
|
|||
)(add_custom_lowering)
|
||||
|
||||
@requires_gpu()
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_jagged_to_padded_dense_sanity_cuda(self):
|
||||
def fn(inp, offsets, max_seq_len):
|
||||
return torch.ops.test_inductor_ops.jagged_to_padded_dense(
|
||||
|
|
@ -165,6 +167,7 @@ class TestCustomLowering(InductorTestCase):
|
|||
)
|
||||
|
||||
@requires_gpu()
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_jagged_to_padded_dense_zero_size(self):
|
||||
# Previously, the masking was being completely stripped for the
|
||||
# masked load of the input value. That would lead to an IMA
|
||||
|
|
@ -188,6 +191,7 @@ class TestCustomLowering(InductorTestCase):
|
|||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_tanh_approx(self):
|
||||
def fn(inp):
|
||||
return torch.ops.test_inductor_ops.tanh_approx(inp)
|
||||
|
|
@ -202,6 +206,7 @@ class TestCustomLowering(InductorTestCase):
|
|||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_multi_inp_asm(self):
|
||||
def fn(a, b):
|
||||
return torch.ops.test_inductor_ops.add_custom(a, b)
|
||||
|
|
|
|||
|
|
@ -394,6 +394,7 @@ class TestGroupBatchFusion(TestCase):
|
|||
)
|
||||
counters.clear()
|
||||
|
||||
@unittest.skipIf(GPU_TYPE == "mps", "welford_reduce is yet not implemented for MPS")
|
||||
def test_batch_layer_norm_fusion(self):
|
||||
for has_weight in [True, False]:
|
||||
for has_bias in [True, False]:
|
||||
|
|
|
|||
|
|
@ -806,6 +806,16 @@ def skip_if_triton(fn):
|
|||
return wrapper
|
||||
|
||||
|
||||
def skip_if_not_triton(fn):
    """Decorator: run the test method only when its device uses a triton backend.

    The wrapped method is called with ``self`` only. When
    ``is_triton_backend(self.device)`` is falsy the test is reported as
    skipped by raising ``unittest.SkipTest``; otherwise the result of the
    original method is returned unchanged.
    """

    @functools.wraps(fn)
    def wrapper(self):
        # Happy path first: a triton backend is available for this device.
        if is_triton_backend(self.device):
            return fn(self)
        raise unittest.SkipTest(f"triton backend is required for {self.device}")

    return wrapper
|
||||
|
||||
|
||||
def skip_if_dynamic(fn):
|
||||
@functools.wraps(fn)
|
||||
def wrapper(self):
|
||||
|
|
@ -827,8 +837,11 @@ def is_mps_backend(device):
|
|||
|
||||
|
||||
def is_triton_backend(device):
    """Return whether the backend configured for ``device`` is triton.

    Args:
        device: either an object exposing a ``type`` attribute or a plain
            device-type value such as ``"cpu"`` or ``"mps"``; when there is
            no ``type`` attribute the value itself is used as the type.

    Returns:
        ``config.cpu_backend == "triton"`` for ``"cpu"``, ``False`` for
        ``"mps"``, and ``config.cuda_backend == "triton"`` for any other
        device type.
    """
    # Normalize once so both device objects and bare strings are handled.
    device_type = getattr(device, "type", device)
    if device_type == "cpu":
        return config.cpu_backend == "triton"
    if device_type == "mps":
        # MPS is never reported as triton, regardless of the config.
        return False
    # Every remaining device type is governed by the cuda_backend setting.
    return config.cuda_backend == "triton"
|
||||
|
||||
|
||||
|
|
@ -1888,7 +1901,7 @@ class CommonTemplate:
|
|||
|
||||
self.common(fn, (torch.full((4,), float("-inf")),))
|
||||
|
||||
@requires_gpu()
|
||||
@skip_if_not_triton
|
||||
def test_reduction_config_limit(self):
|
||||
"""
|
||||
This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in
|
||||
|
|
@ -9634,7 +9647,12 @@ class CommonTemplate:
|
|||
@requires_gpu()
|
||||
@skip_if_halide # cascading accuracy issues due rsqrt fallback
|
||||
def test_tmp_not_defined_issue3(self):
|
||||
from torch import device
|
||||
test_device = torch.device(type=self.device)
|
||||
test_device_0 = (
|
||||
torch.device(type=self.device, index=0)
|
||||
if self.device != "cpu"
|
||||
else test_device
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
|
|
@ -9678,7 +9696,7 @@ class CommonTemplate:
|
|||
1,
|
||||
dtype=torch.int32,
|
||||
layout=torch.strided,
|
||||
device=device(type=GPU_TYPE, index=0),
|
||||
device=test_device_0,
|
||||
pin_memory=False,
|
||||
)
|
||||
|
||||
|
|
@ -9687,7 +9705,7 @@ class CommonTemplate:
|
|||
start=0,
|
||||
step=1,
|
||||
dtype=torch.int32,
|
||||
device=device(type=GPU_TYPE),
|
||||
device=test_device,
|
||||
requires_grad=False,
|
||||
)
|
||||
|
||||
|
|
@ -9697,7 +9715,7 @@ class CommonTemplate:
|
|||
start=0,
|
||||
step=1001,
|
||||
dtype=torch.int32,
|
||||
device=device(type=GPU_TYPE, index=0),
|
||||
device=test_device_0,
|
||||
requires_grad=False,
|
||||
)
|
||||
view: "i32[6150144]" = torch.ops.aten.reshape.default(mul, [-1])
|
||||
|
|
@ -9744,7 +9762,7 @@ class CommonTemplate:
|
|||
permute_1,
|
||||
]
|
||||
|
||||
kwargs = aot_graph_input_parser(forward, device=GPU_TYPE)
|
||||
kwargs = aot_graph_input_parser(forward, device=self.device)
|
||||
self.common(forward, [], kwargs=kwargs)
|
||||
|
||||
@skip_if_gpu_halide
|
||||
|
|
@ -10057,7 +10075,7 @@ class CommonTemplate:
|
|||
@tf32_on_and_off(0.005)
|
||||
def test_inductor_layout_optimization_input_mutations(self):
|
||||
# channel dim must be > 64 for inductor to do layout optimization and use NHWC
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(GPU_TYPE)
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(self.device)
|
||||
|
||||
def f(x):
|
||||
x.mul_(2)
|
||||
|
|
@ -10065,7 +10083,7 @@ class CommonTemplate:
|
|||
return out
|
||||
|
||||
f_compiled = torch.compile(f)
|
||||
x_ref = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
|
||||
x_ref = torch.rand(2, 3, 128, 128, device=self.device)
|
||||
x_test = x_ref.detach().clone()
|
||||
with torch.no_grad():
|
||||
out_ref = f(x_ref)
|
||||
|
|
@ -11403,8 +11421,8 @@ class CommonTemplate:
|
|||
def test_custom_op_fixed_layout_sequential(self):
|
||||
import torch.library
|
||||
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
|
||||
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
|
||||
inp = torch.rand(2, 3, 128, 128, device=self.device)
|
||||
expected_stride = mod(inp).stride()
|
||||
|
||||
def bar(x):
|
||||
|
|
@ -11440,8 +11458,8 @@ class CommonTemplate:
|
|||
@tf32_on_and_off(0.005)
|
||||
def test_mutable_custom_op_fixed_layout2(self):
|
||||
with torch.library._scoped_library("mylib", "DEF") as lib:
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE)
|
||||
inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE)
|
||||
mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device)
|
||||
inp = torch.rand(2, 3, 128, 128, device=self.device)
|
||||
expected_stride = mod(inp).clone().stride()
|
||||
|
||||
lib.define(
|
||||
|
|
@ -11841,7 +11859,7 @@ class CommonTemplate:
|
|||
for cpu_dtype in test_dtypes:
|
||||
if not self.is_dtype_supported(cpu_dtype):
|
||||
continue
|
||||
x = torch.rand([20], device=GPU_TYPE)
|
||||
x = torch.rand([20], device=self.device)
|
||||
y = torch.rand([4], device="cpu", dtype=cpu_dtype)
|
||||
self.common(
|
||||
fn,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
# Owner(s): ["module: inductor"]
|
||||
import importlib
|
||||
from typing import Any, Callable, Optional
|
||||
from unittest import skipIf
|
||||
|
||||
import torch
|
||||
import torch.utils._pytree as pytree
|
||||
|
|
@ -83,6 +84,7 @@ class CodegenInductorTest(InductorTestCase):
|
|||
self.count_code("= reinterpret_tensor(", code, 2)
|
||||
|
||||
@requires_gpu()
|
||||
@skipIf(GPU_TYPE == "mps", "Triton is not available for MPS")
|
||||
def test_kernel_fusion_thresholds(self):
|
||||
def func(a, b):
|
||||
tmp0 = a + 1
|
||||
|
|
|
|||
|
|
@ -6,7 +6,11 @@ import sys
|
|||
import torch
|
||||
from torch._inductor.compile_fx import compile_fx
|
||||
from torch._inductor.test_case import TestCase
|
||||
from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM
|
||||
from torch.testing._internal.common_utils import (
|
||||
IS_LINUX,
|
||||
TEST_WITH_ASAN,
|
||||
TEST_WITH_ROCM,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
_check_has_dynamic_shape,
|
||||
GPU_TYPE,
|
||||
|
|
@ -110,7 +114,6 @@ test_failures = {
|
|||
"test_conv2d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_conv3d_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_conv3d_channels_last_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_mutable_custom_op_fixed_layout2_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_expand_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_full_boolean_dynamic_shapes": TestFailure(("cpu",)),
|
||||
"test_glu_dynamic_shapes": TestFailure(("cpu",)),
|
||||
|
|
@ -258,7 +261,7 @@ test_failures = {
|
|||
"test_zero_element_mutation_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
|
||||
"test_custom_op_3_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
|
||||
"test_custom_op_fixed_layout_sequential_dynamic_shapes": TestFailure(
|
||||
("cpu", "cuda", "xpu")
|
||||
("cuda", "xpu") if IS_LINUX else ("cpu", "cuda", "xpu")
|
||||
),
|
||||
"test_cat_uint8_dynamic_shapes": TestFailure(
|
||||
("cpu",)
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ HAS_CUDA = torch.cuda.is_available() and HAS_TRITON
|
|||
|
||||
HAS_XPU = torch.xpu.is_available() and HAS_TRITON
|
||||
|
||||
HAS_MPS = torch.mps.is_available()
|
||||
|
||||
HAS_GPU = HAS_CUDA or HAS_XPU
|
||||
|
||||
GPU_TYPE = get_gpu_type()
|
||||
|
|
@ -110,7 +112,8 @@ def skip_windows_ci(name: str, file: str) -> None:
|
|||
sys.exit(0)
|
||||
raise unittest.SkipTest("requires sympy/functorch/filelock")
|
||||
|
||||
requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu")
|
||||
# TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS
|
||||
requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu")
|
||||
requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton")
|
||||
|
||||
def requires_cuda_with_enough_memory(min_mem_required):
|
||||
|
|
|
|||
Loading…
Reference in a new issue