onnxruntime/onnxruntime/core/mlas/lib/platform.cpp
RajalakshmiSR 5d8c5409ab
POWER10: QGEMM optimization (#10642)
* POWER10: QGEMM optimization

This patch makes use of POWER10 MMA feature for QGEMM function.
This optimization includes signed and unsigned cases.Tested and
there are no new failures with gcc11 and clang-14.

* Changes as per review comments

Co-authored-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
2022-03-02 08:36:26 -08:00

439 lines
15 KiB
C++

/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
platform.cpp
Abstract:
This module implements logic to select the best configuration for the
this platform.
--*/
#include "mlasi.h"
#include <thread>
#include <mutex>
#if defined(MLAS_TARGET_POWER) && defined(__linux__)
#include <sys/auxv.h>
#endif
#if defined(MLAS_TARGET_ARM64)
#if defined(_WIN32)
// N.B. Support building with downlevel versions of the Windows SDK.
#ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
#define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
#endif
#elif defined(__linux__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
// N.B. Support building with older versions of asm/hwcap.h that do not define
// this capability bit.
#ifndef HWCAP_ASIMDDP
#define HWCAP_ASIMDDP (1 << 20)
#endif
#if defined(BUILD_MLAS_NO_ONNXRUNTIME)
MLASCPUIDInfo::MLASCPUIDInfo() { has_arm_neon_dot_ = ((getauxval(AT_HWCAP) & HWCAP_ASIMDDP) != 0); }
#endif
#endif
#endif // MLAS_TARGET_ARM64
#ifdef MLAS_TARGET_AMD64_IX86
//
// Stores a vector to build a conditional load/store mask for vmaskmovps.
//
MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const uint32_t MlasMaskMoveAvx[8], 32) = { 0, 1, 2, 3, 4, 5, 6, 7 };
//
// Stores a table of AVX vmaskmovps/vmaskmovpd load/store masks.
//
MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const uint32_t MlasMaskMoveTableAvx[16], 32) = {
0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
};
//
// Stores a table of AVX512 opmask register values.
//
MLAS_INTERNAL_DATA MLAS_DECLSPEC_ALIGN(const int16_t MlasOpmask16BitTableAvx512[16], 32) = {
0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F,
0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF,
};
//
// Reads the processor extended control register to determine platform
// capabilities.
//
#if !defined(_XCR_XFEATURE_ENABLED_MASK)
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif
inline
uint64_t
MlasReadExtendedControlRegister(
unsigned int ext_ctrl_reg
)
{
#if defined(_WIN32)
return _xgetbv(ext_ctrl_reg);
#else
uint32_t eax, edx;
__asm__
(
"xgetbv"
: "=a" (eax), "=d" (edx)
: "c" (ext_ctrl_reg)
);
return ((uint64_t)edx << 32) | eax;
#endif
}
#endif
MLAS_PLATFORM::MLAS_PLATFORM(
void
)
/*++
Routine Description:
This routine initializes the platform support for this library.
Arguments:
None.
Return Value:
None.
--*/
{
this->ConvDepthwiseU8S8Kernel = MlasConvDepthwiseKernel<uint8_t, int8_t>;
this->ConvDepthwiseU8U8Kernel = MlasConvDepthwiseKernel<uint8_t, uint8_t>;
this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernel<int8_t, int8_t>;
this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernel<int8_t, uint8_t>;
#if defined(MLAS_TARGET_AMD64_IX86)
//
// Default to the baseline SSE2 support.
//
this->GemmFloatKernel = MlasGemmFloatKernelSse;
this->GemmU8S8Dispatch = &MlasGemmU8X8DispatchSse;
this->GemmU8U8Dispatch = &MlasGemmU8X8DispatchSse;
#if defined(MLAS_TARGET_AMD64)
this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4Sse;
this->GemmDoubleKernel = MlasGemmDoubleKernelSse;
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelSse;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelSse;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelSse;
this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelSse;
this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelSse;
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelSse;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelSse;
this->ComputeExpF32Kernel = MlasComputeExpF32Kernel;
this->LogisticKernelRoutine = MlasLogisticKernel;
this->TanhKernelRoutine = MlasTanhKernel;
this->ErfKernelRoutine = MlasErfKernel;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32Kernel;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel;
this->ReduceMinimumMaximumF32Kernel = MlasReduceMinimumMaximumF32Kernel;
this->QLinearAddS8Kernel = MlasQLinearAddS8Kernel;
this->QLinearAddU8Kernel = MlasQLinearAddU8Kernel;
this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8Kernel;
this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8Kernel;
this->NchwcBlockSize = 8;
this->PreferredBufferAlignment = MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT;
this->MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT;
#endif
unsigned Cpuid1[4];
#if defined(_WIN32)
__cpuid((int*)Cpuid1, 1);
#else
__cpuid(1, Cpuid1[0], Cpuid1[1], Cpuid1[2], Cpuid1[3]);
#endif
#if defined(_MSC_VER)
//
// Check if the processor supports SSE 4.1 instructions.
//
if ((Cpuid1[2] & 0x80000) != 0) {
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchSse41;
}
#endif
//
// Check if the processor supports the AVX and OSXSAVE features.
//
if ((Cpuid1[2] & 0x18000000) == 0x18000000) {
//
// Check if the operating system supports saving SSE and AVX states.
//
uint64_t xcr0 = MlasReadExtendedControlRegister(_XCR_XFEATURE_ENABLED_MASK);
if ((xcr0 & 0x6) == 0x6) {
this->GemmFloatKernel = MlasGemmFloatKernelAvx;
#if defined(MLAS_TARGET_AMD64)
this->KernelM1Routine = MlasSgemmKernelM1Avx;
this->KernelM1TransposeBRoutine = MlasSgemmKernelM1TransposeBAvx;
this->TransposePackB16x4Routine = MlasSgemmTransposePackB16x4Avx;
this->GemmDoubleKernel = MlasGemmDoubleKernelAvx;
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelAvx;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelAvx;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelAvx;
this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelAvx;
this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelAvx;
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelAvx;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelAvx;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelAvx;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelAvx;
this->ReduceMinimumMaximumF32Kernel = MlasReduceMinimumMaximumF32KernelAvx;
//
// Check if the processor supports AVX2/FMA3 features.
//
unsigned Cpuid7[4];
#if defined(_WIN32)
__cpuidex((int*)Cpuid7, 7, 0);
#else
__cpuid_count(7, 0, Cpuid7[0], Cpuid7[1], Cpuid7[2], Cpuid7[3]);
#endif
if (((Cpuid1[2] & 0x1000) != 0) && ((Cpuid7[1] & 0x20) != 0)) {
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchAvx2;
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx2;
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx2;
this->GemmU8U8Dispatch = &MlasGemmU8U8DispatchAvx2;
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx2;
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx2;
this->GemmFloatKernel = MlasGemmFloatKernelFma3;
this->GemmDoubleKernel = MlasGemmDoubleKernelFma3;
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelFma3;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelFma3;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelFma3;
this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelFma3;
this->ComputeExpF32Kernel = MlasComputeExpF32KernelFma3;
this->LogisticKernelRoutine = MlasComputeLogisticF32KernelFma3;
this->TanhKernelRoutine = MlasComputeTanhF32KernelFma3;
this->ErfKernelRoutine = MlasErfKernelFma3;
this->QLinearAddS8Kernel = MlasQLinearAddS8KernelAvx2;
this->QLinearAddU8Kernel = MlasQLinearAddU8KernelAvx2;
this->ConvDepthwiseU8S8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, int8_t>;
this->ConvDepthwiseU8U8Kernel = MlasConvDepthwiseKernelAvx2<uint8_t, uint8_t>;
this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, int8_t>;
this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernelAvx2<int8_t, uint8_t>;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelFma3;
//
// Check if the processor supports Hybrid core architecture.
//
if ((Cpuid7[3] & 0x8000) != 0) {
this->MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT * 4;
}
//
// Check if the processor supports AVXVNNI features.
//
unsigned Cpuid7_1[4];
#if defined(_WIN32)
__cpuidex((int*)Cpuid7_1, 7, 1);
#else
__cpuid_count(7, 1, Cpuid7_1[0], Cpuid7_1[1], Cpuid7_1[2], Cpuid7_1[3]);
#endif
if ((Cpuid7_1[0] & 0x10) != 0) {
this->GemmU8U8Dispatch = &MlasGemmU8S8DispatchAvx2;
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvxVnni;
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvxVnni;
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvxVnni;
}
#if !defined(ORT_MINIMAL_BUILD)
//
// Check if the processor supports AVX512F features and the
// operating system supports saving AVX512F state.
//
if (((Cpuid7[1] & 0x10000) != 0) && ((xcr0 & 0xE0) == 0xE0)) {
this->GemmFloatKernel = MlasGemmFloatKernelAvx512F;
this->GemmDoubleKernel = MlasGemmDoubleKernelAvx512F;
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelAvx512F;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelAvx512F;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelAvx512F;
this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelAvx512F;
this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelAvx512F;
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelAvx512F;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelAvx512F;
this->ComputeExpF32Kernel = MlasComputeExpF32KernelAvx512F;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelAvx512F;
this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelAvx512F;
this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelAvx512F;
this->NchwcBlockSize = 16;
this->PreferredBufferAlignment = 64;
//
// Check if the processor supports AVX512 core features
// (AVX512BW/AVX512DQ/AVX512VL).
//
if ((Cpuid7[1] & 0xC0020000) == 0xC0020000) {
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512Core;
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Core;
this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Core;
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx512Core;
//
// Check if the processor supports AVX512VNNI.
//
if ((Cpuid7[2] & 0x800) != 0) {
this->GemmU8U8Dispatch = &MlasGemmU8S8DispatchAvx2;
this->GemmU8S8Kernel = MlasGemmU8S8KernelAvx512Vnni;
this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Vnni;
this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx512Vnni;
}
}
}
#endif // ORT_MINIMAL_BUILD
}
#endif // MLAS_TARGET_AMD64
}
}
#endif // MLAS_TARGET_AMD64_IX86
#if defined(MLAS_TARGET_ARM64)
this->GemmU8X8Dispatch = &MlasGemmU8X8DispatchNeon;
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchNeon;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchNeon;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon;
//
// Check if the processor supports ASIMD dot product instructions.
//
bool HasDotProductInstructions;
#if defined(_WIN32)
HasDotProductInstructions = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0);
#elif defined(__linux__)
HasDotProductInstructions = MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeonDot();
#else
HasDotProductInstructions = false;
#endif
if (HasDotProductInstructions) {
this->GemmU8X8Dispatch = &MlasGemmU8X8DispatchUdot;
this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchSdot;
this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchDot;
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;
}
#endif // MLAS_TARGET_ARM64
#if defined(MLAS_TARGET_POWER)
this->GemmFloatKernel = MlasSgemmKernel;
this->GemmDoubleKernel = MlasDgemmKernel;
#if defined(__linux__) && defined(POWER10)
#if (defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \
(defined(__clang__) && (__clang_major__ >= 12))
unsigned long hwcap2 = getauxval(AT_HWCAP2);
bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
if (HasP10Instructions) {
this->GemmFloatKernel = MlasSgemmKernelPOWER10;
this->GemmDoubleKernel = MlasDgemmKernelPOWER10;
this->GemmU8X8Dispatch = &MlasGemm8X8DispatchPOWER10;
}
#endif
#endif
#endif
// Init the table describing the type (big or litte) of each core
#if defined(MLAS_TARGET_ARM64) && defined(__linux__)
// TODO!! implemente core uarch detection in Windows
auto tbl_size = std::thread::hardware_concurrency();
if (tbl_size > 0) {
mlas_coretype_tbl.resize(tbl_size, mlas_core_unknown);
}
#endif
}
size_t
MLASCALL
MlasGetPreferredBufferAlignment(
void
)
/*++
Routine Description:
This routine returns the preferred byte alignment for buffers that are used
with this library. Buffers that are not byte aligned to this value will
function, but will not achieve best performance.
Arguments:
None.
Return Value:
Returns the preferred byte alignment for buffers.
--*/
{
#if defined(MLAS_TARGET_AMD64)
return GetMlasPlatform().PreferredBufferAlignment;
#else
return MLAS_DEFAULT_PREFERRED_BUFFER_ALIGNMENT;
#endif
}