# Owner(s): ["oncall: cpu inductor"]
import sys
import unittest
from typing import NamedTuple

import torch
from torch._inductor import config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.testing._internal.common_device_type import (
    get_desired_device_type_test_bases,
)
from torch.testing._internal.common_utils import (
    IS_MACOS,
    IS_WINDOWS,
    slowTest,
    TEST_MKL,
    TEST_WITH_ROCM,
)
from torch.testing._internal.inductor_utils import HAS_CPU
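
# Import the sibling Inductor test suites whose tests are re-run below through
# the cpp_wrapper path; the fbcode build imports them as top-level modules.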
try:
    try:
        from . import (
            test_cpu_repro,
            test_cpu_select_algorithm,
            test_mkldnn_pattern_matcher,
            test_torchinductor,
            test_torchinductor_dynamic_shapes,
        )
    except ImportError:
        import test_cpu_repro  # @manual=fbcode//caffe2/test/inductor:test_cpu_repro-library
        import test_cpu_select_algorithm  # @manual=fbcode//caffe2/test/inductor:cpu_select_algorithm_cpu-library
        import test_mkldnn_pattern_matcher  # @manual
        import test_torchinductor  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
        import test_torchinductor_dynamic_shapes  # @manual=fbcode//caffe2/test/inductor:test_inductor-library_dynamic_shapes
except unittest.SkipTest:
    if __name__ == "__main__":
        sys.exit(0)
    raise
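

# Run these tests only when a CPU device-type test base is requested and we
# are not on macOS.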
_desired_test_bases = get_desired_device_type_test_bases()
RUN_CPU = (
    HAS_CPU
    and any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
    and not IS_MACOS
)
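

# Empty template class; test_torchinductor.copy_tests() below stamps generated
# cpp_wrapper variants of existing CPU tests onto the concrete test classes.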
class CppWrapperTemplate:
    pass


class TestCppWrapper(InductorTestCase):
    device = "cpu"


class DynamicShapesCppWrapperCpuTests(InductorTestCase):
    device = "cpu"
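

# Tests known to fail under cpp_wrapper with dynamic shapes; each entry skips
# the corresponding generated test.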
test_failures_cpp_wrapper = {
    # conv2d will fall back for dynamic shapes; the fallback path is not yet supported
    "test_conv2d_unary_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    "test_conv2d_binary_inplace_fusion_failed_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    "test_conv2d_binary_inplace_fusion_pass_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    # aten._native_multi_head_attention.default is not yet supported for dynamic shapes
    "test_multihead_attention_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
}

if TEST_WITH_ROCM:
    test_failures_cpp_wrapper.update(
        {
            "test_linear_packed": test_torchinductor.TestFailure(
                ("cpp_wrapper",), is_skip=True
            ),
            "test_linear_packed_dynamic_shapes": test_torchinductor.TestFailure(
                ("cpp_wrapper",), is_skip=True
            ),
        }
    )
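

# make_test_case wraps an existing test so that it runs with cpp_wrapper
# enabled and then checks the C++ code that Inductor generated.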
def make_test_case(
    name,
    device,
    tests,
    condition=True,
    slow=False,
    func_inputs=None,
    code_string_count=None,
):
    test_name = f"{name}_{device}" if device else name
    if code_string_count is None:
        code_string_count = {}

    func = getattr(tests, test_name)
    assert callable(func), "not a callable"
    func = slowTest(func) if slow else func
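
    # fn drives the original test through the cpp_wrapper path, capturing the
    # generated C++ code for inspection.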
    @config.patch(cpp_wrapper=True, search_autotune_cache=False)
    def fn(self):
        tests.setUpClass()
        tests.setUp()
        try:
            with torch._C._PreserveDispatchKeyGuard():
                torch._C._dispatch_tls_set_dispatch_key_included(
                    torch._C.DispatchKey.Dense, True
                )

                _, code = test_torchinductor.run_and_get_cpp_code(
                    func, *func_inputs if func_inputs else []
                )
                # If a test generates no code, skip the remaining checks. This can
                # happen for tests validating build-dependent features (e.g. datatypes
                # that are available on some platforms and not others).
                if code:
                    self.assertIn("CppWrapperCodeCache", code)
                    self.assertTrue(
                        all(
                            code.count(string) == code_string_count[string]
                            for string in code_string_count
                        )
                    )
        finally:
            tests.tearDown()
            tests.tearDownClass()
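
    # Preserve the original test's name and attributes (e.g. skip or
    # expected-failure markers) on the generated wrapper.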
    fn.__name__ = test_name
    import copy

    fn.__dict__ = copy.deepcopy(func.__dict__)
    if condition:
        setattr(
            CppWrapperTemplate,
            test_name,
            fn,
        )


if RUN_CPU:

    class BaseTest(NamedTuple):
        name: str
        device: str = "cpu"
        tests: InductorTestCase = test_torchinductor.CpuTests()
        condition: bool = True
        slow: bool = False
        func_inputs: list = None
        code_string_count: dict = {}
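
    # Each entry names an existing test from the imported suites to re-run
    # through cpp_wrapper; condition gates it on platform/backend support.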
    for item in [
        BaseTest("test_add_complex"),
        BaseTest("test_add_complex4"),
        BaseTest("test_as_strided"),  # buffer reuse
        BaseTest("test_bernoulli1"),
        BaseTest("test_bitwise"),  # int32
        BaseTest("test_bmm1"),
        BaseTest("test_bmm2"),
        BaseTest("test_cat"),  # alias
        BaseTest(
            "test_conv2d_binary_inplace_fusion_failed",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            func_inputs=[
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary("],
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary_("],
            ],
        ),
        BaseTest(
            "test_conv2d_binary_inplace_fusion_pass",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            func_inputs=[
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary_("],
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary("],
            ],
        ),
        BaseTest(
            "test_conv2d_unary",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            slow=True,
        ),
        BaseTest("test_conv_transpose2d_packed", "cpu", test_cpu_repro.CPUReproTests()),
        BaseTest("test_cumsum"),
        BaseTest("test_custom_op_1"),
        BaseTest("test_custom_op_2"),
        BaseTest("test_custom_op_3"),
        BaseTest("test_dtype_sympy_expr"),
        BaseTest("test_embedding_bag"),  # test default FallbackKernel
        BaseTest("test_index_put1"),
        BaseTest("test_index_put_deterministic_fallback"),
        BaseTest("test_adding_tensor_offsets"),
        BaseTest("test_inductor_layout_optimization_input_mutations"),
        BaseTest("test_int_div", "", test_cpu_repro.CPUReproTests()),
        BaseTest("test_int8_weight_only_quant"),
        BaseTest("test_linear1"),
        BaseTest("test_linear2"),
        *[
            BaseTest(func, "", test_cpu_select_algorithm.TestSelectAlgorithmCPU())
            for func in dir(test_cpu_select_algorithm.TestSelectAlgorithmCPU())
            if func.startswith(
                (
                    "test_linear_with_pointwise",
                    "test_grouped_linear",
                )
            )
        ],
        BaseTest("test_polar"),
        BaseTest(
            "test_linear_binary",
            "",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            torch.backends.mkldnn.is_available()
            and torch.ops.mkldnn._is_mkldnn_bf16_supported(),
        ),
        BaseTest(
            "test_linear_packed",
            "",
            test_cpu_repro.CPUReproTests(),
            torch.backends.mkldnn.is_available()
            and (
                torch.ops.mkldnn._is_mkldnn_bf16_supported()
                or torch.ops.mkldnn._is_mkldnn_fp16_supported()
            ),
        ),
        *[
            BaseTest(
                func,
                "",
                test_cpu_repro.CPUReproTests(),
                condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
            )
            for func in dir(test_cpu_repro.CPUReproTests())
            if func.startswith("test_lstm_packed_change_input_sizes")
        ],
        BaseTest("test_max_pool2d6"),
        BaseTest(
            "test_mkl_linear", "", test_cpu_repro.CPUReproTests(), condition=TEST_MKL
        ),
        BaseTest("test_mm_views"),
        BaseTest("test_multihead_attention", "cpu", test_cpu_repro.CPUReproTests()),
        BaseTest(
            "test_multi_threading",
            condition=not IS_WINDOWS,
            # Two threads compile, so we expect the output code to be printed twice.
            code_string_count={"py::gil_scoped_release release;": 2},
        ),
        BaseTest("test_profiler_mark_wrapper_call"),
        BaseTest(
            "test_qconv2d",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_qconv2d_relu",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_qconv2d_add",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_qconv2d_add_relu",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_qconv2d_dequant_promotion",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_qconv2d_maxpool2d_linear_dynamic",
            "cpu",
            test_mkldnn_pattern_matcher.TestDynamicPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
            func_inputs=[
                [
                    "aoti_torch_cpu__qconv2d_pointwise_tensor",
                    "torch.ops.quantized.max_pool2d",
                    "aoti_torch_cpu__qlinear_pointwise_tensor",
                ]
            ],
        ),
        *[
            BaseTest(
                func,
                "",
                test_mkldnn_pattern_matcher.TestPatternMatcher(),
                condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
            )
            for func in dir(test_mkldnn_pattern_matcher.TestPatternMatcher())
            if func.startswith("test_qlinear")
        ],
        BaseTest(
            "test_qconv2d_with_concat",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_dynamic_qlinear",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_dynamic_qlinear_qat",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest("test_randint"),
        BaseTest("test_randn_with_dtype_and_device"),
        BaseTest("test_reduction1"),  # Reduction
        BaseTest("test_relu"),  # multiple inputs
        BaseTest("test_repeat_interleave", "", test_cpu_repro.CPUReproTests()),
        BaseTest("test_scalar_input"),
        BaseTest("test_scalar_output"),
        BaseTest("test_scaled_dot_product_attention"),
        BaseTest("test_scatter1"),
        BaseTest("test_scatter2"),
        BaseTest("test_scatter3"),
        BaseTest("test_scatter4"),
        BaseTest("test_scatter5"),
        BaseTest("test_scatter6"),
        BaseTest("test_scatter_reduce1"),
        BaseTest("test_scatter_reduce2"),
        BaseTest("test_scatter_reduce3"),
        BaseTest("test_silu"),  # single input, single output
        BaseTest("test_sort"),
        BaseTest("test_sum_dtype"),  # float64
        BaseTest("test_sum_int"),  # bool, int64, int8, uint8
        BaseTest("test_tensor2"),  # constant input
        BaseTest(
            "test_transpose", code_string_count={".reset();": 2}
        ),  # multiple outputs, buffer clear
        BaseTest("test_view_as_complex"),
        BaseTest("test_view_as_real"),
    ]:
        make_test_case(
            item.name,
            item.device,
            item.tests,
            item.condition,
            item.slow,
            item.func_inputs,
            item.code_string_count,
        )
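
    # Stamp the generated tests onto the concrete classes, applying the known
    # cpp_wrapper failures collected above.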
    test_torchinductor.copy_tests(
        CppWrapperTemplate,
        TestCppWrapper,
        "cpp_wrapper",
        test_failures_cpp_wrapper,
    )

    DynamicShapesCppWrapperTemplate = (
        test_torchinductor_dynamic_shapes.make_dynamic_cls(CppWrapperTemplate)
    )

    test_torchinductor.copy_tests(
        DynamicShapesCppWrapperTemplate,
        DynamicShapesCppWrapperCpuTests,
        "cpp_wrapper",
        test_failures_cpp_wrapper,
        xfail_prop="_expected_failure_dynamic_wrapper",
    )


if __name__ == "__main__":
    from torch._inductor.test_case import run_tests
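
    # run_tests(needs="filelock") declares the suite's dependency on the
    # filelock package.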
    if RUN_CPU:
        run_tests(needs="filelock")