mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Add option to force generic algorithms on x86 (#22917)
The option is named onnxruntime_FORCE_GENERIC_ALGORITHMS. Follow-up to https://github.com/microsoft/onnxruntime/pull/22125. ### Description This change adds a compile-time option to disable optimized algorithms and use generic algorithms (excluding AVX* and SSE etc. in GEMM) on x86. This new option is intended only for testing these algorithms, not for production use. The following build command on Linux x86_64 builds onnxruntime with the new option enabled: `./build.sh --parallel --cmake_extra_defines onnxruntime_FORCE_GENERIC_ALGORITHMS=1` ### Motivation and Context This change allows testing generic algorithms. This may be needed for platforms which don't have optimized implementations available, like in https://github.com/microsoft/onnxruntime/pull/22125.
This commit is contained in:
parent
8d99b1a8dc
commit
f6e1d44829
6 changed files with 50 additions and 3 deletions
|
|
@ -252,6 +252,7 @@ cmake_dependent_option(MSVC_Z7_OVERRIDE "replacing /Zi and /ZI with /Z7 when usi
|
|||
|
||||
option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
|
||||
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
|
||||
option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
|
||||
|
||||
# ENABLE_TRAINING includes all training functionality
|
||||
# The following 2 entry points
|
||||
|
|
@ -971,6 +972,10 @@ if (onnxruntime_USE_LOCK_FREE_QUEUE)
|
|||
add_compile_definitions(USE_LOCK_FREE_QUEUE)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_FORCE_GENERIC_ALGORITHMS)
|
||||
add_compile_definitions(FORCE_GENERIC_ALGORITHMS)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_ENABLE_LAZY_TENSOR)
|
||||
# To support LazyTensor, ORT needs to call Python function from C/C++.
|
||||
# so onnxruntime_ENABLE_PYTHON is required.
|
||||
|
|
|
|||
|
|
@ -679,6 +679,13 @@ endif()
|
|||
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
|
||||
file(GLOB_RECURSE mlas_platform_srcs
|
||||
"${MLAS_SRC_DIR}/scalar/*.cpp")
|
||||
elseif (onnxruntime_FORCE_GENERIC_ALGORITHMS)
|
||||
file(GLOB_RECURSE mlas_platform_srcs_generic
|
||||
"${MLAS_SRC_DIR}/scalar/*.cpp")
|
||||
set(mlas_platform_srcs
|
||||
${mlas_platform_srcs}
|
||||
${mlas_platform_srcs_generic}
|
||||
)
|
||||
endif()
|
||||
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -358,6 +358,22 @@ size_t
|
|||
bool ZeroMode
|
||||
);
|
||||
|
||||
#ifdef FORCE_GENERIC_ALGORITHMS
|
||||
typedef
|
||||
size_t
|
||||
(MLASCALL MLAS_GEMM_FLOAT_KERNEL_GENERIC)(
|
||||
const float* A,
|
||||
const float* B,
|
||||
float* C,
|
||||
size_t CountK,
|
||||
size_t CountM,
|
||||
size_t CountN,
|
||||
size_t lda,
|
||||
size_t ldc,
|
||||
float alpha
|
||||
);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(__aarch64__) && defined(__linux__)
|
||||
|
|
@ -733,6 +749,10 @@ extern "C" {
|
|||
#if defined(MLAS_TARGET_AMD64_IX86)
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelSse;
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelAvx;
|
||||
#ifdef FORCE_GENERIC_ALGORITHMS
|
||||
MLAS_GEMM_FLOAT_KERNEL_GENERIC MlasSgemmKernelZero;
|
||||
MLAS_GEMM_FLOAT_KERNEL_GENERIC MlasSgemmKernelAdd;
|
||||
#endif
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelFma3;
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelAvx512F;
|
||||
|
|
|
|||
|
|
@ -286,7 +286,11 @@ Return Value:
|
|||
this->QuantizeLinearS4Kernel = MlasQuantizeLinearS4Kernel;
|
||||
this->QuantizeLinearU4Kernel = MlasQuantizeLinearU4Kernel;
|
||||
#ifndef __APPLE__
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelSse;
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
this->CastF16ToF32Kernel = nullptr;
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
#endif // __APPLE__
|
||||
|
||||
this->NchwcBlockSize = 8;
|
||||
|
|
@ -308,8 +312,11 @@ Return Value:
|
|||
//
|
||||
// Check if the processor supports SSE 4.1 instructions.
|
||||
//
|
||||
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
if ((Cpuid1[2] & 0x80000) != 0) {
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
if (false) {
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchSse41;
|
||||
}
|
||||
|
||||
|
|
@ -319,7 +326,11 @@ Return Value:
|
|||
// Check if the processor supports the AVX and OSXSAVE features.
|
||||
//
|
||||
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
if ((Cpuid1[2] & 0x18000000) == 0x18000000) {
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
if (false) {
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
|
||||
//
|
||||
// Check if the operating system supports saving SSE and AVX states.
|
||||
|
|
|
|||
|
|
@ -867,6 +867,7 @@ MlasGemmQuantGetDispatch(
|
|||
{
|
||||
const MLAS_GEMM_QUANT_DISPATCH* GemmQuantDispatch = &MlasGemmQuantDispatchDefault;
|
||||
|
||||
#if !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
#if defined(MLAS_TARGET_AMD64_IX86)
|
||||
if (AIsSigned) {
|
||||
GemmQuantDispatch =
|
||||
|
|
@ -901,6 +902,7 @@ MlasGemmQuantGetDispatch(
|
|||
BIsSigned ? GetMlasPlatform().GemmU8S8Dispatch : GetMlasPlatform().GemmU8U8Dispatch;
|
||||
}
|
||||
#endif
|
||||
#endif // !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
if (nullptr == GemmQuantDispatch) {
|
||||
std::stringstream ss;
|
||||
|
|
|
|||
|
|
@ -1061,7 +1061,7 @@ Return Value:
|
|||
|
||||
size_t RowsHandled;
|
||||
|
||||
#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
|
||||
#if (defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)) && !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode);
|
||||
#else
|
||||
if (ZeroMode) {
|
||||
|
|
@ -1158,6 +1158,7 @@ Return Value:
|
|||
|
||||
if (M == 1 && TransA == CblasNoTrans && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
|
||||
|
||||
#if !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
|
||||
MLAS_SGEMM_KERNEL_M1_ROUTINE* SgemmKernelM1Routine;
|
||||
|
|
@ -1181,6 +1182,7 @@ Return Value:
|
|||
}
|
||||
|
||||
#endif
|
||||
#endif // !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -1193,7 +1195,7 @@ Return Value:
|
|||
|
||||
if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
|
||||
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
#if defined(MLAS_TARGET_AMD64) && !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
MLAS_SGEMM_KERNEL_M1_ROUTINE* SgemmKernelM1Routine;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue