POWER10: QGEMM optimization (#10642)

* POWER10: QGEMM optimization

  This patch makes use of the POWER10 MMA feature for the QGEMM function. This optimization includes signed and unsigned cases. Tested with gcc11 and clang-14; there are no new failures.

* Changes as per review comments

Co-authored-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
2026-05-17 21:10:43 +00:00 · 2022-03-02 10:36:26 -06:00 · 2022-03-02 10:36:26 -06:00 · 5d8c5409ab
commit 5d8c5409ab
parent e5c6dc1fc8
5 changed files with 1205 additions and 0 deletions
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@ -334,13 +334,16 @@ else()
            )
            if (HAS_P10_RUNTIME)
              set_source_files_properties(${MLAS_SRC_DIR}/platform.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
+              set_source_files_properties(${MLAS_SRC_DIR}/qgemm.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
            endif()
            set(mlas_platform_srcs_power10
              ${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp
              ${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp
+              ${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp
            )
            set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10 -DSINGLE")
            set_source_files_properties(${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
+            set_source_files_properties(${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp PROPERTIES COMPILE_FLAGS "-O3 -mcpu=power10")
            set(mlas_platform_srcs
              ${mlas_platform_srcs}
              ${mlas_platform_srcs_power10}
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@ -704,6 +704,7 @@ extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchUdot;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmS8S8DispatchSdot;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmU8X8DispatchWasmSimd;
 extern const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault;
+extern const MLAS_GEMM_QUANT_DISPATCH MlasGemm8X8DispatchPOWER10;

 //
 // Symmetric quantized qgemm dispatch structure
@ -849,6 +850,7 @@ struct MLAS_PLATFORM {

 #if defined(MLAS_TARGET_POWER)
    MLAS_GEMM_DOUBLE_KERNEL* GemmDoubleKernel;
+    const MLAS_GEMM_QUANT_DISPATCH* GemmU8X8Dispatch;
 #endif
 #if defined(MLAS_TARGET_AMD64)
    MLAS_SGEMM_KERNEL_M1_ROUTINE* KernelM1Routine;
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@ -392,6 +392,7 @@ Return Value:
    if (HasP10Instructions) {
        this->GemmFloatKernel = MlasSgemmKernelPOWER10;
        this->GemmDoubleKernel = MlasDgemmKernelPOWER10;
+        this->GemmU8X8Dispatch = &MlasGemm8X8DispatchPOWER10;
    }
 #endif
 #endif
--- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp
+++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp
--- a/onnxruntime/core/mlas/lib/qgemm.h
+++ b/onnxruntime/core/mlas/lib/qgemm.h
@ -849,6 +849,12 @@ MlasGemmQuantGetDispatch(
    if (!AIsSigned) {
        GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd;
    }
+#elif defined(MLAS_TARGET_POWER) && defined(__linux__)  && defined(POWER10) && \
+    ((defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \
+    (defined(__clang__) && (__clang_major__ >= 12)))
+    if (GetMlasPlatform().GemmU8X8Dispatch == &MlasGemm8X8DispatchPOWER10) {
+        GemmQuantDispatch = GetMlasPlatform().GemmU8X8Dispatch;
+    }
 #endif

    if (nullptr == GemmQuantDispatch) {