From 88c20eaef17c6a5b0b4f3e4277edfe0f969c52de Mon Sep 17 00:00:00 2001 From: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Date: Fri, 13 Mar 2020 22:45:51 -0700 Subject: [PATCH] MLAS: rename AVX512BW->AVX512Core (#3216) Cleanup change: remap functions and files with Avx512BW to Avx512Core. --- cmake/onnxruntime_mlas.cmake | 57 +++++++++---------- .../lib/amd64/QgemmU8S8KernelAvx512Common.inc | 2 +- ...12BW.asm => QgemmU8S8KernelAvx512Core.asm} | 6 +- .../lib/amd64/QgemmU8U8KernelAvx512Common.inc | 2 +- ...12BW.asm => QgemmU8U8KernelAvx512Core.asm} | 6 +- .../lib/amd64/QgemmU8X8KernelAvx512Common.inc | 4 +- .../lib/amd64/QgemvU8S8KernelAvx512Common.inc | 10 ++-- ...12BW.asm => QgemvU8S8KernelAvx512Core.asm} | 6 +- onnxruntime/core/mlas/lib/mlasi.h | 6 +- onnxruntime/core/mlas/lib/platform.cpp | 24 +++++--- .../lib/x86_64/QgemmU8S8KernelAvx512Common.h | 2 +- ...Avx512BW.S => QgemmU8S8KernelAvx512Core.S} | 6 +- .../lib/x86_64/QgemmU8U8KernelAvx512Common.h | 2 +- ...Avx512BW.S => QgemmU8U8KernelAvx512Core.S} | 6 +- .../lib/x86_64/QgemmU8X8KernelAvx512Common.h | 4 +- .../lib/x86_64/QgemvU8S8KernelAvx512Common.h | 10 ++-- ...Avx512BW.S => QgemvU8S8KernelAvx512Core.S} | 6 +- 17 files changed, 81 insertions(+), 78 deletions(-) rename onnxruntime/core/mlas/lib/amd64/{QgemmU8S8KernelAvx512BW.asm => QgemmU8S8KernelAvx512Core.asm} (96%) rename onnxruntime/core/mlas/lib/amd64/{QgemmU8U8KernelAvx512BW.asm => QgemmU8U8KernelAvx512Core.asm} (96%) rename onnxruntime/core/mlas/lib/amd64/{QgemvU8S8KernelAvx512BW.asm => QgemvU8S8KernelAvx512Core.asm} (73%) rename onnxruntime/core/mlas/lib/x86_64/{QgemmU8S8KernelAvx512BW.S => QgemmU8S8KernelAvx512Core.S} (97%) rename onnxruntime/core/mlas/lib/x86_64/{QgemmU8U8KernelAvx512BW.S => QgemmU8U8KernelAvx512Core.S} (97%) rename onnxruntime/core/mlas/lib/x86_64/{QgemvU8S8KernelAvx512BW.S => QgemvU8S8KernelAvx512Core.S} (74%) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 38ec173dfc..50d1da0e59 100644 
--- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -48,12 +48,12 @@ if(MSVC) set(mlas_platform_srcs ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx2.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm + ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm + ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Vnni.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Vnni.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx2.asm - ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm + ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Vnni.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelSse2.asm ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelAvx.asm @@ -185,25 +185,24 @@ else() ) set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") - # Some platforms do not support AVX512 flags but still able to compile the source - # Others support the flag and refuse to compile without the flag. - # We have to run all 3 checks + # Some toolchains do not support AVX512 compiler flags but are still able + # to build the sources. Other toolchains require the AVX512 compiler flags + # to be specified. 
check_cxx_compiler_flag("-mavx512f" HAS_AVX512F) if(HAS_AVX512F) set(CMAKE_REQUIRED_FLAGS "-mavx512f") else() set(CMAKE_REQUIRED_FLAGS "") endif() - check_cxx_source_compiles(" int main() { asm(\"vpxord %zmm0,%zmm0,%zmm0\"); return 0; }" - AVX512F_COMPILES + COMPILES_AVX512F ) - if(AVX512F_COMPILES) + if(COMPILES_AVX512F) set(mlas_platform_srcs_avx512f ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelAvx512F.S ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx512F.S @@ -214,46 +213,44 @@ else() set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f") endif() - # AVX512BW support is only available if AVX512F support is present. - check_cxx_compiler_flag("-mavx512bw" HAS_AVX512BW) - if(HAS_AVX512BW) - set(CMAKE_REQUIRED_FLAGS "-mavx512bw") + check_cxx_compiler_flag("-mavx512bw -mavx512dq -mavx512vl" HAS_AVX512CORE) + if(HAS_AVX512CORE) + set(CMAKE_REQUIRED_FLAGS "-mavx512bw -mavx512dq -mavx512vl") endif() check_cxx_source_compiles(" int main() { - asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\"); + asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\"); // AVX512BW feature + asm(\"vandnps %xmm31,%xmm31,%xmm31\"); // AVX512DQ/AVX512VL feature return 0; }" - AVX512BW_COMPILES + COMPILES_AVX512CORE ) - if(AVX512BW_COMPILES) - set(mlas_platform_srcs_avx512bw - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S + if(COMPILES_AVX512CORE) + set(mlas_platform_srcs_avx512core + ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S + ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Vnni.S ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Vnni.S - ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S + ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Vnni.S ) - - if(HAS_AVX512BW) 
- set_source_files_properties(${mlas_platform_srcs_avx512bw} PROPERTIES COMPILE_FLAGS "-mavx512bw") + if(HAS_AVX512CORE) + set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl") endif() - else() # AVX512BW_COMPILES - # - set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512BW_UNSUPPORTED") - endif() # AVX512BW_COMPILES - else() # AVX512F_COMPILES + else() + set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512CORE_UNSUPPORTED") + endif() + else() set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512F_UNSUPPORTED") - endif() # AVX512F_COMPILES + endif() set(mlas_platform_srcs ${mlas_platform_srcs_sse2} ${mlas_platform_srcs_avx} ${mlas_platform_srcs_avx2} ${mlas_platform_srcs_avx512f} - ${mlas_platform_srcs_avx512bw} + ${mlas_platform_srcs_avx512core} ) endif() endif() diff --git a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc index 7d1d4fb10f..aa132a7df9 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc +++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc @@ -11,7 +11,7 @@ ; Abstract: ; ; This module contains common kernel macros and structures for the quantized -; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and +; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and ; AVX512VNNI kernels. 
; ;-- diff --git a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm similarity index 96% rename from onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm rename to onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm index cb3b819476..f6c8d2d327 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm +++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm @@ -6,14 +6,14 @@ ; ; Module Name: ; -; QgemmU8S8KernelAvx512BW.asm +; QgemmU8S8KernelAvx512Core.asm ; ; Abstract: ; ; This module implements the kernels for the quantized integer matrix/matrix ; multiply operation (QGEMM). ; -; This implementation uses AVX512BW instructions. +; This implementation uses AVX512 core instructions (BW/DQ/VL). ; ;-- @@ -125,6 +125,6 @@ ENDIF ; Generate the GEMM kernel. ; -GemmU8X8KernelAvx512Function U8S8, Avx512BW +GemmU8X8KernelAvx512Function U8S8, Avx512Core END diff --git a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc index b46cf32b9b..7afacbbdd4 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc +++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc @@ -11,7 +11,7 @@ ; Abstract: ; ; This module contains common kernel macros and structures for the quantized -; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and +; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and ; AVX512VNNI kernels. 
; ;-- diff --git a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm similarity index 96% rename from onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm rename to onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm index 25087fbafa..cd80710300 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm +++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm @@ -6,14 +6,14 @@ ; ; Module Name: ; -; QgemmU8U8KernelAvx512BW.asm +; QgemmU8U8KernelAvx512Core.asm ; ; Abstract: ; ; This module implements the kernels for the quantized integer matrix/matrix ; multiply operation (QGEMM). ; -; This implementation uses AVX512BW instructions. +; This implementation uses AVX512 core instructions (BW/DQ/VL). ; ;-- @@ -122,6 +122,6 @@ ENDIF ; Generate the GEMM kernel. ; -GemmU8X8KernelAvx512Function U8U8, Avx512BW +GemmU8X8KernelAvx512Function U8U8, Avx512Core END diff --git a/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc b/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc index 5b364facb8..259cce3d69 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc +++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc @@ -11,7 +11,7 @@ ; Abstract: ; ; This module contains common kernel macros and structures for the quantized -; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and +; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and ; AVX512VNNI kernels. 
; ;-- @@ -369,7 +369,7 @@ GemmU8X8KernelAvx512Function MACRO Type, Isa mov esi,-1 kmovw k1,esi ; update mask to write all columns IFIDNI <Type>, <U8S8> -IFIDNI <Isa>, <Avx512BW> +IFIDNI <Isa>, <Avx512Core> neg esi vpbroadcastw zmm5,esi ; generate 512-bit word vector [0x0001] ENDIF diff --git a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc index 49098ece72..a97cad9d90 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc +++ b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc @@ -11,7 +11,7 @@ ; Abstract: ; ; This module contains common kernel macros and structures for the quantized -; integer matrix/vector multiply operation (QGEMV) for the AVX512BW and +; integer matrix/vector multiply operation (QGEMV) for the AVX512 core and ; AVX512VNNI kernels. ; ;-- @@ -93,7 +93,7 @@ GemvU8S8KernelAvx512Function MACRO Isa kmovw k1,eax ; compute vector load/store mask mov rcx,GemvU8S8KernelFrame.ldb[rsp] mov r11,rsp ; set ZeroMode to any non-zero value -IFIDNI <Isa>, <Avx512BW> +IFIDNI <Isa>, <Avx512Core> mov eax,1 vpbroadcastw zmm29,eax ENDIF @@ -136,7 +136,7 @@ ProcessColumnLoop4By64: vpunpckhwd zmm17,zmm20,zmm22 vpunpcklwd zmm18,zmm21,zmm23 vpunpckhwd zmm19,zmm21,zmm23 -IFIDNI <Isa>, <Avx512BW> +IFIDNI <Isa>, <Avx512Core> vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 vpmaddubsw zmm17,zmm28,zmm17 @@ -248,7 +248,7 @@ ComputeOutput4By16: vinserti128 ymm5,ymm5,xmm1,1 ; concatenate 256-bit vector vinserti128 ymm3,ymm3,xmm2,1 vshufi32x4 zmm16,zmm5,zmm3,044h ; concatenate 512-bit vector -IFIDNI <Isa>, <Avx512BW> +IFIDNI <Isa>, <Avx512Core> vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 ELSE @@ -337,7 +337,7 @@ ComputeOutputSmallKBy16: vinserti128 ymm5,ymm5,xmm1,1 ; concatenate 256-bit vector vinserti128 ymm3,ymm3,xmm2,1 vshufi32x4 zmm16,zmm5,zmm3,044h ; concatenate 512-bit vector -IFIDNI <Isa>, <Avx512BW> +IFIDNI <Isa>, <Avx512Core> vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 ELSE diff --git a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm 
b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm similarity index 73% rename from onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm rename to onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm index 0a7fa01e46..c1727b3e34 100644 --- a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm +++ b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm @@ -6,14 +6,14 @@ ; ; Module Name: ; -; QgemvU8S8KernelAvx512BW.asm +; QgemvU8S8KernelAvx512Core.asm ; ; Abstract: ; ; This module implements the kernels for the quantized integer matrix/vector ; multiply operation (QGEMV). ; -; This implementation uses AVX512BW instructions. +; This implementation uses AVX512 core instructions (BW/DQ/VL). ; ;-- @@ -26,6 +26,6 @@ INCLUDE QgemvU8S8KernelAvx512Common.inc ; Generate the GEMV kernel. ; -GemvU8S8KernelAvx512Function Avx512BW +GemvU8S8KernelAvx512Function Avx512Core END diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index d11feb930c..293efda47c 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -493,14 +493,14 @@ extern "C" { MLAS_GEMM_U8S8_COPY_PACKB_ROUTINE MlasGemmU8S8CopyPackBAvx2; MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx2; MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx2; - MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512BW; - MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512BW; + MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Core; + MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Core; MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Vnni; MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Vnni; MLAS_GEMM_U8U8_COPY_PACKA_ROUTINE MlasGemmU8U8CopyPackAAvx2; MLAS_GEMM_U8U8_COPY_PACKB_ROUTINE MlasGemmU8U8CopyPackBAvx2; MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx2; - MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512BW; + MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Core; MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Vnni; #endif #endif diff --git 
a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index dd722be009..c74f040701 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -211,16 +211,19 @@ Return Value: this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F; this->NchwcBlockSize = 16; this->PreferredBufferAlignment = 64; - // - // Check if the processor supports AVX512BW. - // -#if !defined(MLAS_AVX512BW_UNSUPPORTED) - if ((Cpuid7[1] & 0x40000000) != 0) { + // + // Check if the processor supports AVX512 core features + // (AVX512BW/AVX512DQ/AVX512VL). + // - this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512BW; - this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512BW; - this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512BW; +#if !defined(MLAS_AVX512CORE_UNSUPPORTED) + + if ((Cpuid7[1] & 0xC0020000) == 0xC0020000) { + + this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512Core; + this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Core; + this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Core; // // Check if the processor supports AVX512VNNI. @@ -233,8 +236,11 @@ Return Value: this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Vnni; } } -#endif // MLAS_AVX512BW_UNSUPPORTED + +#endif // MLAS_AVX512CORE_UNSUPPORTED + } + #endif // MLAS_AVX512F_UNSUPPORTED } diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h index 41437286a5..eb483c03fe 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h +++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h @@ -11,7 +11,7 @@ Module Name: Abstract: This module contains common kernel macros and structures for the quantized - integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and + integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and AVX512VNNI kernels. 
--*/ diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S similarity index 97% rename from onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S rename to onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S index c994545362..c1f3300425 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S +++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S @@ -6,14 +6,14 @@ Licensed under the MIT License. Module Name: - QgemmU8S8KernelAvx512BW.s + QgemmU8S8KernelAvx512Core.s Abstract: This module implements the kernels for the quantized integer matrix/matrix multiply operation (QGEMM). - This implementation uses AVX512BW instructions. + This implementation uses AVX512 core instructions (BW/DQ/VL). --*/ @@ -131,6 +131,6 @@ Implicit Arguments: // Generate the GEMM kernel. // -GemmU8X8KernelAvx512Function U8S8, Avx512BW +GemmU8X8KernelAvx512Function U8S8, Avx512Core .end diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h index 486dd5667b..9872817b3c 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h +++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h @@ -11,7 +11,7 @@ Module Name: Abstract: This module contains common kernel macros and structures for the quantized - integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and + integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and AVX512VNNI kernels. 
--*/ diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S similarity index 97% rename from onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S rename to onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S index 1e251d94ab..d00021616a 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S +++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S @@ -6,14 +6,14 @@ Licensed under the MIT License. Module Name: - QgemmU8U8KernelAvx512BW.s + QgemmU8U8KernelAvx512Core.s Abstract: This module implements the kernels for the quantized integer matrix/matrix multiply operation (QGEMM). - This implementation uses AVX512BW instructions. + This implementation uses AVX512 core instructions (BW/DQ/VL). --*/ @@ -128,6 +128,6 @@ Implicit Arguments: // Generate the GEMM kernel. // -GemmU8X8KernelAvx512Function U8U8, Avx512BW +GemmU8X8KernelAvx512Function U8U8, Avx512Core .end diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h b/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h index 18f82b15ad..1d700ed780 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h +++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h @@ -11,7 +11,7 @@ Module Name: Abstract: This module contains common kernel macros and structures for the quantized - integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and + integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and AVX512VNNI kernels. 
--*/ @@ -343,7 +343,7 @@ C_UNDERSCORE(MlasGemm\Type\()Kernel\Isa\()): mov ebp,-1 kmovw k1,ebp # update mask to write all columns .ifeqs "\Type\()", "U8S8" -.ifeqs "\Isa\()", "Avx512BW" +.ifeqs "\Isa\()", "Avx512Core" neg ebp vpbroadcastw zmm5,ebp # generate 512-bit word vector [0x0001] .endif diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h index f3bc5d2526..c5a45c6cfe 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h +++ b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h @@ -11,7 +11,7 @@ Module Name: Abstract: This module contains common kernel macros and structures for the quantized - integer matrix/vector multiply operation (QGEMV) for the AVX512BW and + integer matrix/vector multiply operation (QGEMV) for the AVX512 core and AVX512VNNI kernels. --*/ @@ -83,7 +83,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()): mov rcx,rbx mov r10,rdx mov r11,rsp # set ZeroMode to any non-zero value -.ifeqs "\Isa\()", "Avx512BW" +.ifeqs "\Isa\()", "Avx512Core" mov eax,1 vpbroadcastw zmm29,eax .endif @@ -126,7 +126,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()): vpunpckhwd zmm17,zmm20,zmm22 vpunpcklwd zmm18,zmm21,zmm23 vpunpckhwd zmm19,zmm21,zmm23 -.ifeqs "\Isa\()", "Avx512BW" +.ifeqs "\Isa\()", "Avx512Core" vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 vpmaddubsw zmm17,zmm28,zmm17 @@ -234,7 +234,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()): vinserti128 ymm5,ymm5,xmm1,1 # concatenate 256-bit vector vinserti128 ymm3,ymm3,xmm2,1 vshufi32x4 zmm16,zmm5,zmm3,0x44 # concatenate 512-bit vector -.ifeqs "\Isa\()", "Avx512BW" +.ifeqs "\Isa\()", "Avx512Core" vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 .else @@ -323,7 +323,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()): vinserti128 ymm5,ymm5,xmm1,1 # concatenate 256-bit vector vinserti128 ymm3,ymm3,xmm2,1 vshufi32x4 zmm16,zmm5,zmm3,0x44 # concatenate 512-bit vector -.ifeqs "\Isa\()", "Avx512BW" 
+.ifeqs "\Isa\()", "Avx512Core" vpmaddubsw zmm16,zmm28,zmm16 vpmaddwd zmm20,zmm16,zmm29 .else diff --git a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S similarity index 74% rename from onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S rename to onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S index 9d19fd99a3..841eed0d53 100644 --- a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S +++ b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S @@ -6,14 +6,14 @@ Licensed under the MIT License. Module Name: - QgemvU8S8KernelAvx512BW.s + QgemvU8S8KernelAvx512Core.s Abstract: This module implements the kernels for the quantized integer matrix/vector multiply operation (QGEMV). - This implementation uses AVX512BW instructions. + This implementation uses AVX512 core instructions (BW/DQ/VL). --*/ @@ -28,6 +28,6 @@ Abstract: // Generate the GEMV kernel. // -GemvU8S8KernelAvx512Function Avx512BW +GemvU8S8KernelAvx512Function Avx512Core .end