mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-30 03:37:44 +00:00
MLAS: rename AVX512BW->AVX512Core (#3216)
Cleanup change: remap functions and files with Avx512BW to Avx512Core.
This commit is contained in:
parent
2a6e5ce978
commit
88c20eaef1
17 changed files with 81 additions and 78 deletions
|
|
@ -48,12 +48,12 @@ if(MSVC)
|
|||
set(mlas_platform_srcs
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx2.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx2.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512BW.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512BW.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Core.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Core.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8S8KernelAvx512Vnni.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemvU8S8KernelAvx512Vnni.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx2.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512BW.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Core.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/QgemmU8U8KernelAvx512Vnni.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelSse2.asm
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/amd64/DgemmKernelAvx.asm
|
||||
|
|
@ -185,25 +185,24 @@ else()
|
|||
)
|
||||
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
|
||||
|
||||
# Some platforms do not support AVX512 flags but still able to compile the source
|
||||
# Others support the flag and refuse to compile without the flag.
|
||||
# We have to run all 3 checks
|
||||
# Some toolchains do not support AVX512 compiler flags but are still able
|
||||
# to build the sources. Other toolchains require the AVX512 compiler flags
|
||||
# to be specified.
|
||||
check_cxx_compiler_flag("-mavx512f" HAS_AVX512F)
|
||||
if(HAS_AVX512F)
|
||||
set(CMAKE_REQUIRED_FLAGS "-mavx512f")
|
||||
else()
|
||||
set(CMAKE_REQUIRED_FLAGS "")
|
||||
endif()
|
||||
|
||||
check_cxx_source_compiles("
|
||||
int main() {
|
||||
asm(\"vpxord %zmm0,%zmm0,%zmm0\");
|
||||
return 0;
|
||||
}"
|
||||
AVX512F_COMPILES
|
||||
COMPILES_AVX512F
|
||||
)
|
||||
|
||||
if(AVX512F_COMPILES)
|
||||
if(COMPILES_AVX512F)
|
||||
set(mlas_platform_srcs_avx512f
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/DgemmKernelAvx512F.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/SgemmKernelAvx512F.S
|
||||
|
|
@ -214,46 +213,44 @@ else()
|
|||
set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")
|
||||
endif()
|
||||
|
||||
# AVX512BW support is only available if AVX512F support is present.
|
||||
check_cxx_compiler_flag("-mavx512bw" HAS_AVX512BW)
|
||||
if(HAS_AVX512BW)
|
||||
set(CMAKE_REQUIRED_FLAGS "-mavx512bw")
|
||||
check_cxx_compiler_flag("-mavx512bw -mavx512dq -mavx512vl" HAS_AVX512CORE)
|
||||
if(HAS_AVX512CORE)
|
||||
set(CMAKE_REQUIRED_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
|
||||
endif()
|
||||
check_cxx_source_compiles("
|
||||
int main() {
|
||||
asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\");
|
||||
asm(\"vpmaddwd %zmm0,%zmm0,%zmm0\"); // AVX512BW feature
|
||||
asm(\"vandnps %xmm31,%xmm31,%xmm31\"); // AVX512DQ/AVX512VL feature
|
||||
return 0;
|
||||
}"
|
||||
AVX512BW_COMPILES
|
||||
COMPILES_AVX512CORE
|
||||
)
|
||||
|
||||
if(AVX512BW_COMPILES)
|
||||
set(mlas_platform_srcs_avx512bw
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512BW.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512BW.S
|
||||
if(COMPILES_AVX512CORE)
|
||||
set(mlas_platform_srcs_avx512core
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Core.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Core.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8S8KernelAvx512Vnni.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemvU8S8KernelAvx512Vnni.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512BW.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Core.S
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86_64/QgemmU8U8KernelAvx512Vnni.S
|
||||
)
|
||||
|
||||
if(HAS_AVX512BW)
|
||||
set_source_files_properties(${mlas_platform_srcs_avx512bw} PROPERTIES COMPILE_FLAGS "-mavx512bw")
|
||||
if(HAS_AVX512CORE)
|
||||
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
|
||||
endif()
|
||||
else() # AVX512BW_COMPILES
|
||||
#
|
||||
set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512BW_UNSUPPORTED")
|
||||
endif() # AVX512BW_COMPILES
|
||||
else() # AVX512F_COMPILES
|
||||
else()
|
||||
set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512CORE_UNSUPPORTED")
|
||||
endif()
|
||||
else()
|
||||
set_source_files_properties(${mlas_common_srcs} PROPERTIES COMPILE_FLAGS "-DMLAS_AVX512F_UNSUPPORTED")
|
||||
endif() # AVX512F_COMPILES
|
||||
endif()
|
||||
|
||||
set(mlas_platform_srcs
|
||||
${mlas_platform_srcs_sse2}
|
||||
${mlas_platform_srcs_avx}
|
||||
${mlas_platform_srcs_avx2}
|
||||
${mlas_platform_srcs_avx512f}
|
||||
${mlas_platform_srcs_avx512bw}
|
||||
${mlas_platform_srcs_avx512core}
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@
|
|||
; Abstract:
|
||||
;
|
||||
; This module contains common kernel macros and structures for the quantized
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
; AVX512VNNI kernels.
|
||||
;
|
||||
;--
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@
|
|||
;
|
||||
; Module Name:
|
||||
;
|
||||
; QgemmU8S8KernelAvx512BW.asm
|
||||
; QgemmU8S8KernelAvx512Core.asm
|
||||
;
|
||||
; Abstract:
|
||||
;
|
||||
; This module implements the kernels for the quantized integer matrix/matrix
|
||||
; multiply operation (QGEMM).
|
||||
;
|
||||
; This implementation uses AVX512BW instructions.
|
||||
; This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
;
|
||||
;--
|
||||
|
||||
|
|
@ -125,6 +125,6 @@ ENDIF
|
|||
; Generate the GEMM kernel.
|
||||
;
|
||||
|
||||
GemmU8X8KernelAvx512Function U8S8, Avx512BW
|
||||
GemmU8X8KernelAvx512Function U8S8, Avx512Core
|
||||
|
||||
END
|
||||
|
|
@ -11,7 +11,7 @@
|
|||
; Abstract:
|
||||
;
|
||||
; This module contains common kernel macros and structures for the quantized
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
; AVX512VNNI kernels.
|
||||
;
|
||||
;--
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@
|
|||
;
|
||||
; Module Name:
|
||||
;
|
||||
; QgemmU8U8KernelAvx512BW.asm
|
||||
; QgemmU8U8KernelAvx512Core.asm
|
||||
;
|
||||
; Abstract:
|
||||
;
|
||||
; This module implements the kernels for the quantized integer matrix/matrix
|
||||
; multiply operation (QGEMM).
|
||||
;
|
||||
; This implementation uses AVX512BW instructions.
|
||||
; This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
;
|
||||
;--
|
||||
|
||||
|
|
@ -122,6 +122,6 @@ ENDIF
|
|||
; Generate the GEMM kernel.
|
||||
;
|
||||
|
||||
GemmU8X8KernelAvx512Function U8U8, Avx512BW
|
||||
GemmU8X8KernelAvx512Function U8U8, Avx512Core
|
||||
|
||||
END
|
||||
|
|
@ -11,7 +11,7 @@
|
|||
; Abstract:
|
||||
;
|
||||
; This module contains common kernel macros and structures for the quantized
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
; integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
; AVX512VNNI kernels.
|
||||
;
|
||||
;--
|
||||
|
|
@ -369,7 +369,7 @@ GemmU8X8KernelAvx512Function MACRO Type, Isa
|
|||
mov esi,-1
|
||||
kmovw k1,esi ; update mask to write all columns
|
||||
IFIDNI <Type>, <U8S8>
|
||||
IFIDNI <Isa>, <Avx512BW>
|
||||
IFIDNI <Isa>, <Avx512Core>
|
||||
neg esi
|
||||
vpbroadcastw zmm5,esi ; generate 512-bit word vector [0x0001]
|
||||
ENDIF
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@
|
|||
; Abstract:
|
||||
;
|
||||
; This module contains common kernel macros and structures for the quantized
|
||||
; integer matrix/vector multiply operation (QGEMV) for the AVX512BW and
|
||||
; integer matrix/vector multiply operation (QGEMV) for the AVX512 core and
|
||||
; AVX512VNNI kernels.
|
||||
;
|
||||
;--
|
||||
|
|
@ -93,7 +93,7 @@ GemvU8S8KernelAvx512Function MACRO Isa
|
|||
kmovw k1,eax ; compute vector load/store mask
|
||||
mov rcx,GemvU8S8KernelFrame.ldb[rsp]
|
||||
mov r11,rsp ; set ZeroMode to any non-zero value
|
||||
IFIDNI <Isa>, <Avx512BW>
|
||||
IFIDNI <Isa>, <Avx512Core>
|
||||
mov eax,1
|
||||
vpbroadcastw zmm29,eax
|
||||
ENDIF
|
||||
|
|
@ -136,7 +136,7 @@ ProcessColumnLoop4By64:
|
|||
vpunpckhwd zmm17,zmm20,zmm22
|
||||
vpunpcklwd zmm18,zmm21,zmm23
|
||||
vpunpckhwd zmm19,zmm21,zmm23
|
||||
IFIDNI <Isa>, <Avx512BW>
|
||||
IFIDNI <Isa>, <Avx512Core>
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
vpmaddubsw zmm17,zmm28,zmm17
|
||||
|
|
@ -248,7 +248,7 @@ ComputeOutput4By16:
|
|||
vinserti128 ymm5,ymm5,xmm1,1 ; concatenate 256-bit vector
|
||||
vinserti128 ymm3,ymm3,xmm2,1
|
||||
vshufi32x4 zmm16,zmm5,zmm3,044h ; concatenate 512-bit vector
|
||||
IFIDNI <Isa>, <Avx512BW>
|
||||
IFIDNI <Isa>, <Avx512Core>
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
ELSE
|
||||
|
|
@ -337,7 +337,7 @@ ComputeOutputSmallKBy16:
|
|||
vinserti128 ymm5,ymm5,xmm1,1 ; concatenate 256-bit vector
|
||||
vinserti128 ymm3,ymm3,xmm2,1
|
||||
vshufi32x4 zmm16,zmm5,zmm3,044h ; concatenate 512-bit vector
|
||||
IFIDNI <Isa>, <Avx512BW>
|
||||
IFIDNI <Isa>, <Avx512Core>
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
ELSE
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@
|
|||
;
|
||||
; Module Name:
|
||||
;
|
||||
; QgemvU8S8KernelAvx512BW.asm
|
||||
; QgemvU8S8KernelAvx512Core.asm
|
||||
;
|
||||
; Abstract:
|
||||
;
|
||||
; This module implements the kernels for the quantized integer matrix/vector
|
||||
; multiply operation (QGEMV).
|
||||
;
|
||||
; This implementation uses AVX512BW instructions.
|
||||
; This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
;
|
||||
;--
|
||||
|
||||
|
|
@ -26,6 +26,6 @@ INCLUDE QgemvU8S8KernelAvx512Common.inc
|
|||
; Generate the GEMV kernel.
|
||||
;
|
||||
|
||||
GemvU8S8KernelAvx512Function Avx512BW
|
||||
GemvU8S8KernelAvx512Function Avx512Core
|
||||
|
||||
END
|
||||
|
|
@ -493,14 +493,14 @@ extern "C" {
|
|||
MLAS_GEMM_U8S8_COPY_PACKB_ROUTINE MlasGemmU8S8CopyPackBAvx2;
|
||||
MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx2;
|
||||
MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx2;
|
||||
MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512BW;
|
||||
MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512BW;
|
||||
MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Core;
|
||||
MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Core;
|
||||
MLAS_GEMM_U8S8_KERNEL MlasGemmU8S8KernelAvx512Vnni;
|
||||
MLAS_GEMV_U8S8_KERNEL MlasGemvU8S8KernelAvx512Vnni;
|
||||
MLAS_GEMM_U8U8_COPY_PACKA_ROUTINE MlasGemmU8U8CopyPackAAvx2;
|
||||
MLAS_GEMM_U8U8_COPY_PACKB_ROUTINE MlasGemmU8U8CopyPackBAvx2;
|
||||
MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx2;
|
||||
MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512BW;
|
||||
MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Core;
|
||||
MLAS_GEMM_U8U8_KERNEL MlasGemmU8U8KernelAvx512Vnni;
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -211,16 +211,19 @@ Return Value:
|
|||
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F;
|
||||
this->NchwcBlockSize = 16;
|
||||
this->PreferredBufferAlignment = 64;
|
||||
//
|
||||
// Check if the processor supports AVX512BW.
|
||||
//
|
||||
#if !defined(MLAS_AVX512BW_UNSUPPORTED)
|
||||
|
||||
if ((Cpuid7[1] & 0x40000000) != 0) {
|
||||
//
|
||||
// Check if the processor supports AVX512 core features
|
||||
// (AVX512BW/AVX512DQ/AVX512VL).
|
||||
//
|
||||
|
||||
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512BW;
|
||||
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512BW;
|
||||
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512BW;
|
||||
#if !defined(MLAS_AVX512CORE_UNSUPPORTED)
|
||||
|
||||
if ((Cpuid7[1] & 0xC0020000) == 0xC0020000) {
|
||||
|
||||
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512Core;
|
||||
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Core;
|
||||
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Core;
|
||||
|
||||
//
|
||||
// Check if the processor supports AVX512VNNI.
|
||||
|
|
@ -233,8 +236,11 @@ Return Value:
|
|||
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Vnni;
|
||||
}
|
||||
}
|
||||
#endif // MLAS_AVX512BW_UNSUPPORTED
|
||||
|
||||
#endif // MLAS_AVX512CORE_UNSUPPORTED
|
||||
|
||||
}
|
||||
|
||||
#endif // MLAS_AVX512F_UNSUPPORTED
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ Module Name:
|
|||
Abstract:
|
||||
|
||||
This module contains common kernel macros and structures for the quantized
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
AVX512VNNI kernels.
|
||||
|
||||
--*/
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@ Licensed under the MIT License.
|
|||
|
||||
Module Name:
|
||||
|
||||
QgemmU8S8KernelAvx512BW.s
|
||||
QgemmU8S8KernelAvx512Core.s
|
||||
|
||||
Abstract:
|
||||
|
||||
This module implements the kernels for the quantized integer matrix/matrix
|
||||
multiply operation (QGEMM).
|
||||
|
||||
This implementation uses AVX512BW instructions.
|
||||
This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
|
||||
--*/
|
||||
|
||||
|
|
@ -131,6 +131,6 @@ Implicit Arguments:
|
|||
// Generate the GEMM kernel.
|
||||
//
|
||||
|
||||
GemmU8X8KernelAvx512Function U8S8, Avx512BW
|
||||
GemmU8X8KernelAvx512Function U8S8, Avx512Core
|
||||
|
||||
.end
|
||||
|
|
@ -11,7 +11,7 @@ Module Name:
|
|||
Abstract:
|
||||
|
||||
This module contains common kernel macros and structures for the quantized
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
AVX512VNNI kernels.
|
||||
|
||||
--*/
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@ Licensed under the MIT License.
|
|||
|
||||
Module Name:
|
||||
|
||||
QgemmU8U8KernelAvx512BW.s
|
||||
QgemmU8U8KernelAvx512Core.s
|
||||
|
||||
Abstract:
|
||||
|
||||
This module implements the kernels for the quantized integer matrix/matrix
|
||||
multiply operation (QGEMM).
|
||||
|
||||
This implementation uses AVX512BW instructions.
|
||||
This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
|
||||
--*/
|
||||
|
||||
|
|
@ -128,6 +128,6 @@ Implicit Arguments:
|
|||
// Generate the GEMM kernel.
|
||||
//
|
||||
|
||||
GemmU8X8KernelAvx512Function U8U8, Avx512BW
|
||||
GemmU8X8KernelAvx512Function U8U8, Avx512Core
|
||||
|
||||
.end
|
||||
|
|
@ -11,7 +11,7 @@ Module Name:
|
|||
Abstract:
|
||||
|
||||
This module contains common kernel macros and structures for the quantized
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512BW and
|
||||
integer matrix/matrix multiply operation (QGEMM) for the AVX512 core and
|
||||
AVX512VNNI kernels.
|
||||
|
||||
--*/
|
||||
|
|
@ -343,7 +343,7 @@ C_UNDERSCORE(MlasGemm\Type\()Kernel\Isa\()):
|
|||
mov ebp,-1
|
||||
kmovw k1,ebp # update mask to write all columns
|
||||
.ifeqs "\Type\()", "U8S8"
|
||||
.ifeqs "\Isa\()", "Avx512BW"
|
||||
.ifeqs "\Isa\()", "Avx512Core"
|
||||
neg ebp
|
||||
vpbroadcastw zmm5,ebp # generate 512-bit word vector [0x0001]
|
||||
.endif
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ Module Name:
|
|||
Abstract:
|
||||
|
||||
This module contains common kernel macros and structures for the quantized
|
||||
integer matrix/vector multiply operation (QGEMV) for the AVX512BW and
|
||||
integer matrix/vector multiply operation (QGEMV) for the AVX512 core and
|
||||
AVX512VNNI kernels.
|
||||
|
||||
--*/
|
||||
|
|
@ -83,7 +83,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
|
|||
mov rcx,rbx
|
||||
mov r10,rdx
|
||||
mov r11,rsp # set ZeroMode to any non-zero value
|
||||
.ifeqs "\Isa\()", "Avx512BW"
|
||||
.ifeqs "\Isa\()", "Avx512Core"
|
||||
mov eax,1
|
||||
vpbroadcastw zmm29,eax
|
||||
.endif
|
||||
|
|
@ -126,7 +126,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
|
|||
vpunpckhwd zmm17,zmm20,zmm22
|
||||
vpunpcklwd zmm18,zmm21,zmm23
|
||||
vpunpckhwd zmm19,zmm21,zmm23
|
||||
.ifeqs "\Isa\()", "Avx512BW"
|
||||
.ifeqs "\Isa\()", "Avx512Core"
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
vpmaddubsw zmm17,zmm28,zmm17
|
||||
|
|
@ -234,7 +234,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
|
|||
vinserti128 ymm5,ymm5,xmm1,1 # concatenate 256-bit vector
|
||||
vinserti128 ymm3,ymm3,xmm2,1
|
||||
vshufi32x4 zmm16,zmm5,zmm3,0x44 # concatenate 512-bit vector
|
||||
.ifeqs "\Isa\()", "Avx512BW"
|
||||
.ifeqs "\Isa\()", "Avx512Core"
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
.else
|
||||
|
|
@ -323,7 +323,7 @@ C_UNDERSCORE(MlasGemvU8S8Kernel\Isa\()):
|
|||
vinserti128 ymm5,ymm5,xmm1,1 # concatenate 256-bit vector
|
||||
vinserti128 ymm3,ymm3,xmm2,1
|
||||
vshufi32x4 zmm16,zmm5,zmm3,0x44 # concatenate 512-bit vector
|
||||
.ifeqs "\Isa\()", "Avx512BW"
|
||||
.ifeqs "\Isa\()", "Avx512Core"
|
||||
vpmaddubsw zmm16,zmm28,zmm16
|
||||
vpmaddwd zmm20,zmm16,zmm29
|
||||
.else
|
||||
|
|
|
|||
|
|
@ -6,14 +6,14 @@ Licensed under the MIT License.
|
|||
|
||||
Module Name:
|
||||
|
||||
QgemvU8S8KernelAvx512BW.s
|
||||
QgemvU8S8KernelAvx512Core.s
|
||||
|
||||
Abstract:
|
||||
|
||||
This module implements the kernels for the quantized integer matrix/vector
|
||||
multiply operation (QGEMV).
|
||||
|
||||
This implementation uses AVX512BW instructions.
|
||||
This implementation uses AVX512 core instructions (BW/DQ/VL).
|
||||
|
||||
--*/
|
||||
|
||||
|
|
@ -28,6 +28,6 @@ Abstract:
|
|||
// Generate the GEMV kernel.
|
||||
//
|
||||
|
||||
GemvU8S8KernelAvx512Function Avx512BW
|
||||
GemvU8S8KernelAvx512Function Avx512Core
|
||||
|
||||
.end
|
||||
Loading…
Reference in a new issue