MLAS: rename AVX512BW->AVX512Core (#3216)

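Cleanup change: rename functions and files from Avx512BW to Avx512Core.
The "core" kernels now require AVX512DQ and AVX512VL in addition to AVX512BW,
both in the build-time compiler checks and in the runtime CPUID feature check.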
2026-06-30 03:37:44 +00:00 · 2020-03-13 22:45:51 -07:00 · 2020-03-13 22:45:51 -07:00 · 88c20eaef1
commit 88c20eaef1
parent 2a6e5ce978
17 changed files with 81 additions and 78 deletions
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@ -48,12 +48,12 @@ if(MSVC)
    set(mlas_platform_srcs
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx2.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx2.asm
-      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm
-      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm
+      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm
+      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Vnni.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Vnni.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx2.asm
-      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm
+      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Vnni.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelSse2.asm
      ${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelAvx.asm
@ -185,25 +185,24 @@ else()
    )
    set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")

-    # Some platforms do not support AVX512 flags but still able to compile the source
-    # Others support the flag and refuse to compile without the flag.
-    # We have to run all 3 checks
+    # Some toolchains do not support AVX512 compiler flags but are still able
+    # to build the sources. Other toolchains require the AVX512 compiler flags
+    # to be specified.
    check_cxx_compiler_flag("-mavx512f" HAS_AVX512F)
    if(HAS_AVX512F)
      set(CMAKE_REQUIRED_FLAGS "-mavx512f")
    else()
      set(CMAKE_REQUIRED_FLAGS "")
    endif()
-
    check_cxx_source_compiles("
      int main() {
        asm(\"vpxord %zmm0,%zmm0,%zmm0\");
        return 0;
      }"
-      AVX512F_COMPILES
+      COMPILES_AVX512F
    )

-    if(AVX512F_COMPILES)
+    if(COMPILES_AVX512F)
      set(mlas_platform_srcs_avx512f
        ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelAvx512F.S
        ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx512F.S
@ -214,46 +213,44 @@ else()
        set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")
      endif()

-      # AVX512BW support is only available if AVX512F support is present.
-      check_cxx_compiler_flag("-mavx512bw" HAS_AVX512BW)
-      if(HAS_AVX512BW)
-        set(CMAKE_REQUIRED_FLAGS "-mavx512bw")
+      check_cxx_compiler_flag("-mavx512bw -mavx512dq -mavx512vl" HAS_AVX512CORE)
+      if(HAS_AVX512CORE)
+        set(CMAKE_REQUIRED_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
      endif()
      check_cxx_source_compiles("
        int main() {
-          asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\");
+          asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\"); // AVX512BW feature
+          asm(\"vandnps %xmm31,%xmm31,%xmm31\"); // AVX512DQ/AVX512VL feature
          return 0;
        }"
-        AVX512BW_COMPILES
+        COMPILES_AVX512CORE
      )

-      if(AVX512BW_COMPILES)
-        set(mlas_platform_srcs_avx512bw
-          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S
-          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S
+      if(COMPILES_AVX512CORE)
+        set(mlas_platform_srcs_avx512core
+          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S
+          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S
          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Vnni.S
          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Vnni.S
-          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S
+          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S
          ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Vnni.S
        )
-
-        if(HAS_AVX512BW)
-          set_source_files_properties(${mlas_platform_srcs_avx512bw} PROPERTIES COMPILE_FLAGS "-mavx512bw")
+        if(HAS_AVX512CORE)
+          set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
        endif()
-      else() # AVX512BW_COMPILES
-        #
-        set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512BW_UNSUPPORTED")
-      endif() # AVX512BW_COMPILES
-    else() # AVX512F_COMPILES
+      else()
+        set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512CORE_UNSUPPORTED")
+      endif()
+    else()
      set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512F_UNSUPPORTED")
-    endif() # AVX512F_COMPILES
+    endif()

    set(mlas_platform_srcs
      ${mlas_platform_srcs_sse2}
      ${mlas_platform_srcs_avx}
      ${mlas_platform_srcs_avx2}
      ${mlas_platform_srcs_avx512f}
-      ${mlas_platform_srcs_avx512bw}
+      ${mlas_platform_srcs_avx512core}
    )
  endif()
 endif()
--- a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc
+++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Common.inc
@ -11,7 +11,7 @@
 ; Abstract:
 ;
 ;   This module contains common kernel macros and structures for the quantized
-;   integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+;   integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
 ;   AVX512VNNI kernels.
 ;
 ;--
--- a/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm
+++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm
@ -6,14 +6,14 @@
 ;
 ; Module Name:
 ;
-;   QgemmU8S8KernelAvx512BW.asm
+;   QgemmU8S8KernelAvx512Core.asm
 ;
 ; Abstract:
 ;
 ;   This module implements the kernels for the quantized integer matrix/matrix
 ;   multiply operation (QGEMM).
 ;
-;   This implementation uses AVX512BW instructions.
+;   This implementation uses AVX512 core instructions (BW/DQ/VL).
 ;
 ;--

@ -125,6 +125,6 @@ ENDIF
 ; Generate the GEMM kernel.
 ;

-GemmU8X8KernelAvx512Function U8S8, Avx512BW
+GemmU8X8KernelAvx512Function U8S8, Avx512Core

        END
--- a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc
+++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Common.inc
@ -11,7 +11,7 @@
 ; Abstract:
 ;
 ;   This module contains common kernel macros and structures for the quantized
-;   integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+;   integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
 ;   AVX512VNNI kernels.
 ;
 ;--
--- a/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm
+++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm
@ -6,14 +6,14 @@
 ;
 ; Module Name:
 ;
-;   QgemmU8U8KernelAvx512BW.asm
+;   QgemmU8U8KernelAvx512Core.asm
 ;
 ; Abstract:
 ;
 ;   This module implements the kernels for the quantized integer matrix/matrix
 ;   multiply operation (QGEMM).
 ;
-;   This implementation uses AVX512BW instructions.
+;   This implementation uses AVX512 core instructions (BW/DQ/VL).
 ;
 ;--

@ -122,6 +122,6 @@ ENDIF
 ; Generate the GEMM kernel.
 ;

-GemmU8X8KernelAvx512Function U8U8, Avx512BW
+GemmU8X8KernelAvx512Function U8U8, Avx512Core

        END
--- a/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc
+++ b/onnxruntime/core/mlas/lib/amd64/QgemmU8X8KernelAvx512Common.inc
@ -11,7 +11,7 @@
 ; Abstract:
 ;
 ;   This module contains common kernel macros and structures for the quantized
-;   integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+;   integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
 ;   AVX512VNNI kernels.
 ;
 ;--
@ -369,7 +369,7 @@ GemmU8X8KernelAvx512Function MACRO Type, Isa
        mov     esi,-1
        kmovw   k1,esi                      ; update mask to write all columns
 IFIDNI <Type>, <U8S8>
-IFIDNI <Isa>, <Avx512BW>
+IFIDNI <Isa>, <Avx512Core>
        neg     esi
        vpbroadcastw zmm5,esi               ; generate 512-bit word vector [0x0001]
 ENDIF
--- a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc
+++ b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Common.inc
@ -11,7 +11,7 @@
 ; Abstract:
 ;
 ;   This module contains common kernel macros and structures for the quantized
-;   integer matrix/vector multiply operation (QGEMV) for the AVX512BW and
+;   integer matrix/vector multiply operation (QGEMV) for the AVX512 core and
 ;   AVX512VNNI kernels.
 ;
 ;--
@ -93,7 +93,7 @@ GemvU8S8KernelAvx512Function MACRO Isa
        kmovw   k1,eax                      ; compute vector load/store mask
        mov     rcx,GemvU8S8KernelFrame.ldb[rsp]
        mov     r11,rsp                     ; set ZeroMode to any non-zero value
-IFIDNI <Isa>, <Avx512BW>
+IFIDNI <Isa>, <Avx512Core>
        mov     eax,1
        vpbroadcastw zmm29,eax
 ENDIF
@ -136,7 +136,7 @@ ProcessColumnLoop4By64:
        vpunpckhwd zmm17,zmm20,zmm22
        vpunpcklwd zmm18,zmm21,zmm23
        vpunpckhwd zmm19,zmm21,zmm23
-IFIDNI <Isa>, <Avx512BW>
+IFIDNI <Isa>, <Avx512Core>
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
        vpmaddubsw zmm17,zmm28,zmm17
@ -248,7 +248,7 @@ ComputeOutput4By16:
        vinserti128 ymm5,ymm5,xmm1,1        ; concatenate 256-bit vector
        vinserti128 ymm3,ymm3,xmm2,1
        vshufi32x4 zmm16,zmm5,zmm3,044h     ; concatenate 512-bit vector
-IFIDNI <Isa>, <Avx512BW>
+IFIDNI <Isa>, <Avx512Core>
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
 ELSE
@ -337,7 +337,7 @@ ComputeOutputSmallKBy16:
        vinserti128 ymm5,ymm5,xmm1,1        ; concatenate 256-bit vector
        vinserti128 ymm3,ymm3,xmm2,1
        vshufi32x4 zmm16,zmm5,zmm3,044h     ; concatenate 512-bit vector
-IFIDNI <Isa>, <Avx512BW>
+IFIDNI <Isa>, <Avx512Core>
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
 ELSE
--- a/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm
+++ b/onnxruntime/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm
@ -6,14 +6,14 @@
 ;
 ; Module Name:
 ;
-;   QgemvU8S8KernelAvx512BW.asm
+;   QgemvU8S8KernelAvx512Core.asm
 ;
 ; Abstract:
 ;
 ;   This module implements the kernels for the quantized integer matrix/vector
 ;   multiply operation (QGEMV).
 ;
-;   This implementation uses AVX512BW instructions.
+;   This implementation uses AVX512 core instructions (BW/DQ/VL).
 ;
 ;--

@ -26,6 +26,6 @@ INCLUDE QgemvU8S8KernelAvx512Common.inc
 ; Generate the GEMV kernel.
 ;

-GemvU8S8KernelAvx512Function Avx512BW
+GemvU8S8KernelAvx512Function Avx512Core

        END
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@ -493,14 +493,14 @@ extern "C" {
    MLAS_GEMM_U8S8_COPY_PACKB_ROUTINE MlasGemmU8S8CopyPackBAvx2;
    MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx2;
    MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx2;
-    MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512BW;
-    MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512BW;
+    MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Core;
+    MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Core;
    MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Vnni;
    MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Vnni;
    MLAS_GEMM_U8U8_COPY_PACKA_ROUTINE MlasGemmU8U8CopyPackAAvx2;
    MLAS_GEMM_U8U8_COPY_PACKB_ROUTINE MlasGemmU8U8CopyPackBAvx2;
    MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx2;
-    MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512BW;
+    MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Core;
    MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Vnni;
 #endif
 #endif
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@ -211,16 +211,19 @@ Return Value:
                    this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F;
                    this->NchwcBlockSize = 16;
                    this->PreferredBufferAlignment = 64;
-                    //
-                    // Check if the processor supports AVX512BW.
-                    //
-#if !defined(MLAS_AVX512BW_UNSUPPORTED)

-                    if ((Cpuid7[1] & 0x40000000) != 0) {
+                    //
+                    // Check if the processor supports the AVX512 core features:
+                    // AVX512DQ (bit 17), AVX512BW (bit 30) and AVX512VL (bit 31).
+                    //

-                        this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512BW;
-                        this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512BW;
-                        this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512BW;
+#if !defined(MLAS_AVX512CORE_UNSUPPORTED)
+
+                    if ((Cpuid7[1] & 0xC0020000) == 0xC0020000) {
+
+                        this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512Core;
+                        this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Core;
+                        this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Core;

                        //
                        // Check if the processor supports AVX512VNNI.
@ -233,8 +236,11 @@ Return Value:
                            this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Vnni;
                        }
                    }
-#endif // MLAS_AVX512BW_UNSUPPORTED
+
+#endif // MLAS_AVX512CORE_UNSUPPORTED
+
                }
+
 #endif // MLAS_AVX512F_UNSUPPORTED

            }
--- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Common.h
@ -11,7 +11,7 @@ Module Name:
 Abstract:

    This module contains common kernel macros and structures for the quantized
-    integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+    integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
    AVX512VNNI kernels.

 --*/
--- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S
@ -6,14 +6,14 @@ Licensed under the MIT License.

 Module Name:

-    QgemmU8S8KernelAvx512BW.s
+    QgemmU8S8KernelAvx512Core.s

 Abstract:

    This module implements the kernels for the quantized integer matrix/matrix
    multiply operation (QGEMM).

-    This implementation uses AVX512BW instructions.
+    This implementation uses AVX512 core instructions (BW/DQ/VL).

 --*/

@ -131,6 +131,6 @@ Implicit Arguments:
 // Generate the GEMM kernel.
 //

-GemmU8X8KernelAvx512Function U8S8, Avx512BW
+GemmU8X8KernelAvx512Function U8S8, Avx512Core

        .end
--- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Common.h
@ -11,7 +11,7 @@ Module Name:
 Abstract:

    This module contains common kernel macros and structures for the quantized
-    integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+    integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
    AVX512VNNI kernels.

 --*/
--- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S
@ -6,14 +6,14 @@ Licensed under the MIT License.

 Module Name:

-    QgemmU8U8KernelAvx512BW.s
+    QgemmU8U8KernelAvx512Core.s

 Abstract:

    This module implements the kernels for the quantized integer matrix/matrix
    multiply operation (QGEMM).

-    This implementation uses AVX512BW instructions.
+    This implementation uses AVX512 core instructions (BW/DQ/VL).

 --*/

@ -128,6 +128,6 @@ Implicit Arguments:
 // Generate the GEMM kernel.
 //

-GemmU8X8KernelAvx512Function U8U8, Avx512BW
+GemmU8X8KernelAvx512Function U8U8, Avx512Core

        .end
--- a/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemmU8X8KernelAvx512Common.h
@ -11,7 +11,7 @@ Module Name:
 Abstract:

    This module contains common kernel macros and structures for the quantized
-    integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
+    integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
    AVX512VNNI kernels.

 --*/
@ -343,7 +343,7 @@ C_UNDERSCORE(MlasGemm\Type\()Kernel\Isa\()):
        mov     ebp,-1
        kmovw   k1,ebp                      # update mask to write all columns
 .ifeqs "\Type\()", "U8S8"
-.ifeqs "\Isa\()", "Avx512BW"
+.ifeqs "\Isa\()", "Avx512Core"
        neg     ebp
        vpbroadcastw zmm5,ebp               # generate 512-bit word vector [0x0001]
 .endif
--- a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Common.h
@ -11,7 +11,7 @@ Module Name:
 Abstract:

    This module contains common kernel macros and structures for the quantized
-    integer matrix/vector multiply operation (QGEMV) for the AVX512BW and
+    integer matrix/vector multiply operation (QGEMV) for the AVX512 core and
    AVX512VNNI kernels.

 --*/
@ -83,7 +83,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
        mov     rcx,rbx
        mov     r10,rdx
        mov     r11,rsp                     # set ZeroMode to any non-zero value
-.ifeqs "\Isa\()", "Avx512BW"
+.ifeqs "\Isa\()", "Avx512Core"
        mov     eax,1
        vpbroadcastw zmm29,eax
 .endif
@ -126,7 +126,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
        vpunpckhwd zmm17,zmm20,zmm22
        vpunpcklwd zmm18,zmm21,zmm23
        vpunpckhwd zmm19,zmm21,zmm23
-.ifeqs "\Isa\()", "Avx512BW"
+.ifeqs "\Isa\()", "Avx512Core"
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
        vpmaddubsw zmm17,zmm28,zmm17
@ -234,7 +234,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
        vinserti128 ymm5,ymm5,xmm1,1        # concatenate 256-bit vector
        vinserti128 ymm3,ymm3,xmm2,1
        vshufi32x4 zmm16,zmm5,zmm3,0x44     # concatenate 512-bit vector
-.ifeqs "\Isa\()", "Avx512BW"
+.ifeqs "\Isa\()", "Avx512Core"
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
 .else
@ -323,7 +323,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
        vinserti128 ymm5,ymm5,xmm1,1        # concatenate 256-bit vector
        vinserti128 ymm3,ymm3,xmm2,1
        vshufi32x4 zmm16,zmm5,zmm3,0x44     # concatenate 512-bit vector
-.ifeqs "\Isa\()", "Avx512BW"
+.ifeqs "\Isa\()", "Avx512Core"
        vpmaddubsw zmm16,zmm28,zmm16
        vpmaddwd zmm20,zmm16,zmm29
 .else
--- a/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S
+++ b/onnxruntime/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S
@ -6,14 +6,14 @@ Licensed under the MIT License.

 Module Name:

-    QgemvU8S8KernelAvx512BW.s
+    QgemvU8S8KernelAvx512Core.s

 Abstract:

    This module implements the kernels for the quantized integer matrix/vector
    multiply operation (QGEMV).

-    This implementation uses AVX512BW instructions.
+    This implementation uses AVX512 core instructions (BW/DQ/VL).

 --*/

@ -28,6 +28,6 @@ Abstract:
 // Generate the GEMV kernel.
 //

-GemvU8S8KernelAvx512Function Avx512BW
+GemvU8S8KernelAvx512Function Avx512Core

        .end