diff --git a/test/inductor/test_custom_lowering.py b/test/inductor/test_custom_lowering.py index 17eb27ef4ec..4786a97429e 100644 --- a/test/inductor/test_custom_lowering.py +++ b/test/inductor/test_custom_lowering.py @@ -1,6 +1,7 @@ # Owner(s): ["module: inductor"] from functools import partial +from unittest import skipIf import torch from torch._inductor.ir import Pointwise @@ -140,6 +141,7 @@ class TestCustomLowering(InductorTestCase): )(add_custom_lowering) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_jagged_to_padded_dense_sanity_cuda(self): def fn(inp, offsets, max_seq_len): return torch.ops.test_inductor_ops.jagged_to_padded_dense( @@ -165,6 +167,7 @@ class TestCustomLowering(InductorTestCase): ) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_jagged_to_padded_dense_zero_size(self): # Previously, the masking was being completely stripped for the # masked load of the input value. That would lead to an IMA @@ -188,6 +191,7 @@ class TestCustomLowering(InductorTestCase): @requires_gpu() @skipIfRocm @skipIfXpu + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_tanh_approx(self): def fn(inp): return torch.ops.test_inductor_ops.tanh_approx(inp) @@ -202,6 +206,7 @@ class TestCustomLowering(InductorTestCase): @requires_gpu() @skipIfRocm @skipIfXpu + @skipIf(GPU_TYPE == "mps", "Not applicable to MPS") def test_multi_inp_asm(self): def fn(a, b): return torch.ops.test_inductor_ops.add_custom(a, b) diff --git a/test/inductor/test_group_batch_fusion.py b/test/inductor/test_group_batch_fusion.py index 84b8aa9dcc7..4ce5605aaca 100644 --- a/test/inductor/test_group_batch_fusion.py +++ b/test/inductor/test_group_batch_fusion.py @@ -394,6 +394,7 @@ class TestGroupBatchFusion(TestCase): ) counters.clear() + @unittest.skipIf(GPU_TYPE == "mps", "welford_reduce is not yet implemented for MPS") def test_batch_layer_norm_fusion(self): for has_weight in [True, False]: for has_bias in [True, False]: diff 
--git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 94ec64b1356..8cb033c9b72 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -806,6 +806,16 @@ def skip_if_triton(fn): return wrapper +def skip_if_not_triton(fn): + @functools.wraps(fn) + def wrapper(self): + if not is_triton_backend(self.device): + raise unittest.SkipTest(f"triton backend is required for {self.device}") + return fn(self) + + return wrapper + + def skip_if_dynamic(fn): @functools.wraps(fn) def wrapper(self): @@ -827,8 +837,11 @@ def is_mps_backend(device): def is_triton_backend(device): - if getattr(device, "type", device) == "cpu": + device_type = getattr(device, "type", device) + if device_type == "cpu": return config.cpu_backend == "triton" + if device_type == "mps": + return False return config.cuda_backend == "triton" @@ -1888,7 +1901,7 @@ class CommonTemplate: self.common(fn, (torch.full((4,), float("-inf")),)) - @requires_gpu() + @skip_if_not_triton def test_reduction_config_limit(self): """ This unit-test tests whether we exceed cudaDeviceProperties.maxGridSize in @@ -9634,7 +9647,12 @@ class CommonTemplate: @requires_gpu() @skip_if_halide # cascading accuracy issues due rsqrt fallback def test_tmp_not_defined_issue3(self): - from torch import device + test_device = torch.device(type=self.device) + test_device_0 = ( + torch.device(type=self.device, index=0) + if self.device != "cpu" + else test_device + ) def forward( self, @@ -9678,7 +9696,7 @@ class CommonTemplate: 1, dtype=torch.int32, layout=torch.strided, - device=device(type=GPU_TYPE, index=0), + device=test_device_0, pin_memory=False, ) @@ -9687,7 +9705,7 @@ class CommonTemplate: start=0, step=1, dtype=torch.int32, - device=device(type=GPU_TYPE), + device=test_device, requires_grad=False, ) @@ -9697,7 +9715,7 @@ class CommonTemplate: start=0, step=1001, dtype=torch.int32, - device=device(type=GPU_TYPE, index=0), + device=test_device_0, requires_grad=False, 
) view: "i32[6150144]" = torch.ops.aten.reshape.default(mul, [-1]) @@ -9744,7 +9762,7 @@ class CommonTemplate: permute_1, ] - kwargs = aot_graph_input_parser(forward, device=GPU_TYPE) + kwargs = aot_graph_input_parser(forward, device=self.device) self.common(forward, [], kwargs=kwargs) @skip_if_gpu_halide @@ -10057,7 +10075,7 @@ class CommonTemplate: @tf32_on_and_off(0.005) def test_inductor_layout_optimization_input_mutations(self): # channel dim must be > 64 for inductor to do layout optimization and use NHWC - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(self.device) def f(x): x.mul_(2) @@ -10065,7 +10083,7 @@ class CommonTemplate: return out f_compiled = torch.compile(f) - x_ref = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + x_ref = torch.rand(2, 3, 128, 128, device=self.device) x_test = x_ref.detach().clone() with torch.no_grad(): out_ref = f(x_ref) @@ -11403,8 +11421,8 @@ class CommonTemplate: def test_custom_op_fixed_layout_sequential(self): import torch.library - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) - inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device) + inp = torch.rand(2, 3, 128, 128, device=self.device) expected_stride = mod(inp).stride() def bar(x): @@ -11440,8 +11458,8 @@ class CommonTemplate: @tf32_on_and_off(0.005) def test_mutable_custom_op_fixed_layout2(self): with torch.library._scoped_library("mylib", "DEF") as lib: - mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=GPU_TYPE) - inp = torch.rand(2, 3, 128, 128, device=GPU_TYPE) + mod = nn.Conv2d(3, 128, 1, stride=1, bias=False).to(device=self.device) + inp = torch.rand(2, 3, 128, 128, device=self.device) expected_stride = mod(inp).clone().stride() lib.define( @@ -11841,7 +11859,7 @@ class CommonTemplate: for cpu_dtype in test_dtypes: if not self.is_dtype_supported(cpu_dtype): continue - x = torch.rand([20], 
device=GPU_TYPE) + x = torch.rand([20], device=self.device) y = torch.rand([4], device="cpu", dtype=cpu_dtype) self.common( fn, diff --git a/test/inductor/test_torchinductor_codegen_config_overrides.py b/test/inductor/test_torchinductor_codegen_config_overrides.py index 716f3c63da6..2032aa25428 100644 --- a/test/inductor/test_torchinductor_codegen_config_overrides.py +++ b/test/inductor/test_torchinductor_codegen_config_overrides.py @@ -1,6 +1,7 @@ # Owner(s): ["module: inductor"] import importlib from typing import Any, Callable, Optional +from unittest import skipIf import torch import torch.utils._pytree as pytree @@ -83,6 +84,7 @@ class CodegenInductorTest(InductorTestCase): self.count_code("= reinterpret_tensor(", code, 2) @requires_gpu() + @skipIf(GPU_TYPE == "mps", "Triton is not available for MPS") def test_kernel_fusion_thresholds(self): def func(a, b): tmp0 = a + 1 diff --git a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py index 4a7042a8ab6..ed7e6b00d64 100644 --- a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py @@ -6,7 +6,11 @@ import sys import torch from torch._inductor.compile_fx import compile_fx from torch._inductor.test_case import TestCase -from torch.testing._internal.common_utils import TEST_WITH_ASAN, TEST_WITH_ROCM +from torch.testing._internal.common_utils import ( + IS_LINUX, + TEST_WITH_ASAN, + TEST_WITH_ROCM, +) from torch.testing._internal.inductor_utils import ( _check_has_dynamic_shape, GPU_TYPE, @@ -110,7 +114,6 @@ test_failures = { "test_conv2d_channels_last_dynamic_shapes": TestFailure(("cpu",)), "test_conv3d_dynamic_shapes": TestFailure(("cpu",)), "test_conv3d_channels_last_dynamic_shapes": TestFailure(("cpu",)), - "test_mutable_custom_op_fixed_layout2_dynamic_shapes": TestFailure(("cpu",)), "test_expand_dynamic_shapes": TestFailure(("cpu",)), "test_full_boolean_dynamic_shapes": 
TestFailure(("cpu",)), "test_glu_dynamic_shapes": TestFailure(("cpu",)), @@ -258,7 +261,7 @@ test_failures = { "test_zero_element_mutation_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), "test_custom_op_3_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), "test_custom_op_fixed_layout_sequential_dynamic_shapes": TestFailure( - ("cpu", "cuda", "xpu") + ("cuda", "xpu") if IS_LINUX else ("cpu", "cuda", "xpu") ), "test_cat_uint8_dynamic_shapes": TestFailure( ("cpu",) diff --git a/torch/testing/_internal/inductor_utils.py b/torch/testing/_internal/inductor_utils.py index a5c5684ed80..807bd3c9bed 100644 --- a/torch/testing/_internal/inductor_utils.py +++ b/torch/testing/_internal/inductor_utils.py @@ -52,6 +52,8 @@ HAS_CUDA = torch.cuda.is_available() and HAS_TRITON HAS_XPU = torch.xpu.is_available() and HAS_TRITON +HAS_MPS = torch.mps.is_available() + HAS_GPU = HAS_CUDA or HAS_XPU GPU_TYPE = get_gpu_type() @@ -110,7 +112,8 @@ def skip_windows_ci(name: str, file: str) -> None: sys.exit(0) raise unittest.SkipTest("requires sympy/functorch/filelock") -requires_gpu = functools.partial(unittest.skipIf, not HAS_GPU, "requires gpu") +# TODO: Remove HAS_MPS condition when `HAS_GPU` includes HAS_MPS +requires_gpu = functools.partial(unittest.skipIf, not (HAS_GPU or HAS_MPS), "requires gpu") requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton") def requires_cuda_with_enough_memory(min_mem_required):