# pytorch/test/inductor/test_cpu_cpp_wrapper.py

# Owner(s): ["oncall: cpu inductor"]
import sys
import unittest
from typing import NamedTuple
import torch
from torch._inductor import config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch.testing._internal.common_device_type import (
    get_desired_device_type_test_bases,
)
from torch.testing._internal.common_utils import (
    IS_MACOS,
    IS_WINDOWS,
    slowTest,
    TEST_MKL,
    TEST_WITH_ROCM,
)
from torch.testing._internal.inductor_utils import HAS_CPU
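
# The sibling test modules are imported two ways: as a package for the OSS
# pytest run, and as bare top-level modules for the fbcode/Buck run. The
# @manual annotations below keep Buck's autodeps from rewriting those
# imports (see #135614).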
try:
    try:
        from . import (
            test_cpu_repro,
            test_cpu_select_algorithm,
            test_mkldnn_pattern_matcher,
            test_torchinductor,
            test_torchinductor_dynamic_shapes,
        )
    except ImportError:
        import test_cpu_repro  # @manual=fbcode//caffe2/test/inductor:test_cpu_repro-library
        import test_cpu_select_algorithm  # @manual=fbcode//caffe2/test/inductor:cpu_select_algorithm_cpu-library
        import test_mkldnn_pattern_matcher  # @manual
        import test_torchinductor  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
        import test_torchinductor_dynamic_shapes  # @manual=fbcode//caffe2/test/inductor:test_inductor-library_dynamic_shapes
except unittest.SkipTest:
    if __name__ == "__main__":
        sys.exit(0)
    raise
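
# Run the CPU cpp_wrapper tests only when a CPU backend is among the requested
# device-type test bases and we are not on macOS.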
_desired_test_bases = get_desired_device_type_test_bases()
RUN_CPU = (
    HAS_CPU
    and any(getattr(x, "device_type", "") == "cpu" for x in _desired_test_bases)
    and not IS_MACOS
)
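

# CppWrapperTemplate is an empty holder class: make_test_case() attaches the
# wrapped tests to it, and copy_tests() later copies them onto the concrete
# TestCase classes below.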
class CppWrapperTemplate:
    pass


class TestCppWrapper(InductorTestCase):
    device = "cpu"


class DynamicShapesCppWrapperCpuTests(InductorTestCase):
    device = "cpu"
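

# Known failures for the tests copied below; each entry is skipped outright
# (is_skip=True) rather than marked as an expected failure.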
test_failures_cpp_wrapper = {
    # conv2d will fall back for dynamic shapes; the fallback path is not yet supported
    "test_conv2d_unary_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    "test_conv2d_binary_inplace_fusion_failed_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    "test_conv2d_binary_inplace_fusion_pass_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
    # aten._native_multi_head_attention.default is not yet supported for dynamic shapes
    "test_multihead_attention_cpu_dynamic_shapes": test_torchinductor.TestFailure(
        ("cpp_wrapper",), is_skip=True
    ),
}
if TEST_WITH_ROCM:
    test_failures_cpp_wrapper.update(
        {
            "test_linear_packed": test_torchinductor.TestFailure(
                ("cpp_wrapper",), is_skip=True
            ),
            "test_linear_packed_dynamic_shapes": test_torchinductor.TestFailure(
                ("cpp_wrapper",), is_skip=True
            ),
        }
    )
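

# make_test_case wraps an existing Inductor test so that it runs with
# config.cpp_wrapper enabled, then checks the generated C++ wrapper code
# for the expected strings.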
def make_test_case(
    name,
    device,
    tests,
    condition=True,
    slow=False,
    func_inputs=None,
    code_string_count=None,
):
    test_name = f"{name}_{device}" if device else name
    if code_string_count is None:
        code_string_count = {}

    func = getattr(tests, test_name)
    assert callable(func), "not a callable"
    func = slowTest(func) if slow else func

    @config.patch(cpp_wrapper=True, search_autotune_cache=False)
    def fn(self):
        tests.setUpClass()
        tests.setUp()
        try:
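            # Guard the dispatcher TLS so an exception inside the test cannot
            # leave DispatchKey.Dense set for later tests (see #122073).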
            with torch._C._PreserveDispatchKeyGuard():
                torch._C._dispatch_tls_set_dispatch_key_included(
                    torch._C.DispatchKey.Dense, True
                )

                _, code = test_torchinductor.run_and_get_cpp_code(
                    func, *func_inputs if func_inputs else []
                )
                # If a test generates no code, skip the remaining checks. This can
                # happen for tests validating build-dependent features (e.g. datatypes
                # that are available on some platforms and not others).
                if code:
                    self.assertIn("CppWrapperCodeCache", code)
                    self.assertTrue(
                        all(
                            code.count(string) == code_string_count[string]
                            for string in code_string_count
                        )
                    )
        finally:
            tests.tearDown()
            tests.tearDownClass()

    fn.__name__ = test_name
    import copy

    fn.__dict__ = copy.deepcopy(func.__dict__)
    if condition:
        setattr(
            CppWrapperTemplate,
            test_name,
            fn,
        )
if RUN_CPU:

    class BaseTest(NamedTuple):
        name: str
        device: str = "cpu"
        tests: InductorTestCase = test_torchinductor.CpuTests()
        condition: bool = True
        slow: bool = False
        func_inputs: list = None
        code_string_count: dict = {}
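
    # Each entry names an existing test to re-run under cpp_wrapper and,
    # optionally, the test class it lives on plus a gating condition.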
    for item in [
        BaseTest("test_add_complex"),
        BaseTest("test_add_complex4"),
        BaseTest("test_as_strided"),  # buffer reuse
        BaseTest("test_bernoulli1"),
        BaseTest("test_bitwise"),  # int32
        BaseTest("test_bmm1"),
        BaseTest("test_bmm2"),
        BaseTest("test_cat"),  # alias
        BaseTest(
            "test_conv2d_binary_inplace_fusion_failed",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            func_inputs=[
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary("],
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary_("],
            ],
        ),
        BaseTest(
            "test_conv2d_binary_inplace_fusion_pass",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            func_inputs=[
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary_("],
                ["aoti_torch_cpu_mkldnn__convolution_pointwise_binary("],
            ],
        ),
        BaseTest(
            "test_conv2d_unary",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available(),
            slow=True,
        ),
        BaseTest("test_conv_transpose2d_packed", "cpu", test_cpu_repro.CPUReproTests()),
        BaseTest("test_cumsum"),
        BaseTest("test_custom_op_1"),
        BaseTest("test_custom_op_2"),
        BaseTest("test_custom_op_3"),
        BaseTest("test_dtype_sympy_expr"),
        BaseTest("test_embedding_bag"),  # test default FallbackKernel
        BaseTest("test_index_put1"),
        BaseTest("test_index_put_deterministic_fallback"),
        BaseTest("test_adding_tensor_offsets"),
        BaseTest("test_inductor_layout_optimization_input_mutations"),
        BaseTest("test_int_div", "", test_cpu_repro.CPUReproTests()),
        BaseTest("test_int8_weight_only_quant"),
        BaseTest("test_linear1"),
        BaseTest("test_linear2"),
        *[
            BaseTest(func, "", test_cpu_select_algorithm.TestSelectAlgorithmCPU())
            for func in dir(test_cpu_select_algorithm.TestSelectAlgorithmCPU())
            if func.startswith(
                (
                    "test_linear_with_pointwise",
                    "test_grouped_linear",
                )
            )
        ],
BaseTest("test_polar"),
BaseTest(
"test_linear_binary",
"",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
torch.backends.mkldnn.is_available()
and torch.ops.mkldnn._is_mkldnn_bf16_supported(),
),
BaseTest(
"test_linear_packed",
"",
test_cpu_repro.CPUReproTests(),
torch.backends.mkldnn.is_available()
and (
torch.ops.mkldnn._is_mkldnn_bf16_supported()
or torch.ops.mkldnn._is_mkldnn_fp16_supported()
),
),
*[
BaseTest(
func,
"",
test_cpu_repro.CPUReproTests(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
)
for func in dir(test_cpu_repro.CPUReproTests())
if func.startswith("test_lstm_packed_change_input_sizes")
],
BaseTest("test_max_pool2d6"),
BaseTest(
"test_mkl_linear", "", test_cpu_repro.CPUReproTests(), condition=TEST_MKL
),
BaseTest("test_mm_views"),
BaseTest("test_multihead_attention", "cpu", test_cpu_repro.CPUReproTests()),
BaseTest(
"test_multi_threading",
condition=not IS_WINDOWS,
# Two threads compile, so we expect the output code to be printed twice.
code_string_count={"py::gil_scoped_release release;": 2},
),
BaseTest("test_profiler_mark_wrapper_call"),
BaseTest(
"test_qconv2d",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
BaseTest(
"test_qconv2d_relu",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
BaseTest(
"test_qconv2d_add",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
BaseTest(
"test_qconv2d_add_relu",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
BaseTest(
"test_qconv2d_dequant_promotion",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
BaseTest(
"test_qconv2d_maxpool2d_linear_dynamic",
"cpu",
test_mkldnn_pattern_matcher.TestDynamicPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
func_inputs=[
[
"aoti_torch_cpu__qconv2d_pointwise_tensor",
"torch.ops.quantized.max_pool2d",
"aoti_torch_cpu__qlinear_pointwise_tensor",
]
],
),
*[
BaseTest(
func,
"",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
)
for func in dir(test_mkldnn_pattern_matcher.TestPatternMatcher())
if func.startswith("test_qlinear")
],
BaseTest(
"test_qconv2d_with_concat",
"cpu",
test_mkldnn_pattern_matcher.TestPatternMatcher(),
condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
),
        BaseTest(
            "test_dynamic_qlinear",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest(
            "test_dynamic_qlinear_qat",
            "cpu",
            test_mkldnn_pattern_matcher.TestPatternMatcher(),
            condition=torch.backends.mkldnn.is_available() and not IS_WINDOWS,
        ),
        BaseTest("test_randint"),
        BaseTest("test_randn_with_dtype_and_device"),
        BaseTest("test_reduction1"),  # Reduction
        BaseTest("test_relu"),  # multiple inputs
        BaseTest("test_repeat_interleave", "", test_cpu_repro.CPUReproTests()),
        BaseTest("test_scalar_input"),
        BaseTest("test_scalar_output"),
        BaseTest("test_scaled_dot_product_attention"),
        BaseTest("test_scatter1"),
        BaseTest("test_scatter2"),
        BaseTest("test_scatter3"),
        BaseTest("test_scatter4"),
        BaseTest("test_scatter5"),
        BaseTest("test_scatter6"),
        BaseTest("test_scatter_reduce1"),
        BaseTest("test_scatter_reduce2"),
        BaseTest("test_scatter_reduce3"),
        BaseTest("test_silu"),  # single input, single output
        BaseTest("test_sort"),
        BaseTest("test_sum_dtype"),  # float64
        BaseTest("test_sum_int"),  # bool, int64, int8, uint8
        BaseTest("test_tensor2"),  # constant input
        BaseTest(
            "test_transpose", code_string_count={".reset();": 2}
        ),  # multiple outputs, buffer clear
        BaseTest("test_view_as_complex"),
        BaseTest("test_view_as_real"),
    ]:
        make_test_case(
            item.name,
            item.device,
            item.tests,
            item.condition,
            item.slow,
            item.func_inputs,
            item.code_string_count,
        )
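
    # Copy the generated tests onto the concrete TestCase classes: once as-is
    # for TestCppWrapper, and once through a dynamic-shapes variant of the
    # template for DynamicShapesCppWrapperCpuTests.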
    test_torchinductor.copy_tests(
        CppWrapperTemplate,
        TestCppWrapper,
        "cpp_wrapper",
        test_failures_cpp_wrapper,
    )

    DynamicShapesCppWrapperTemplate = (
        test_torchinductor_dynamic_shapes.make_dynamic_cls(CppWrapperTemplate)
    )
    test_torchinductor.copy_tests(
        DynamicShapesCppWrapperTemplate,
        DynamicShapesCppWrapperCpuTests,
        "cpp_wrapper",
        test_failures_cpp_wrapper,
        xfail_prop="_expected_failure_dynamic_wrapper",
    )

if __name__ == "__main__":
    from torch._inductor.test_case import run_tests

    if RUN_CPU:
        run_tests(needs="filelock")