mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Add option to force generic algorithms on x86 (#22917)
The option is named onnxruntime_FORCE_GENERIC_ALGORITHMS. Follow-up to https://github.com/microsoft/onnxruntime/pull/22125. ### Description This change adds a compile-time option to disable optimized algorithms and use generic algorithms (excluding AVX* and SSE etc. in GEMM) on x86. This new option is intended only for testing these algorithms, not for production use. The following build command on Linux x86_64 builds onnxruntime with the new option enabled: `./build.sh --parallel --cmake_extra_defines onnxruntime_FORCE_GENERIC_ALGORITHMS=1` ### Motivation and Context This change allows testing generic algorithms. This may be needed for platforms which don't have optimized implementations available, like in https://github.com/microsoft/onnxruntime/pull/22125.
This commit is contained in:
parent
8d99b1a8dc
commit
f6e1d44829
6 changed files with 50 additions and 3 deletions
|
|
@ -252,6 +252,7 @@ cmake_dependent_option(MSVC_Z7_OVERRIDE "replacing /Zi and /ZI with /Z7 when usi
|
|||
|
||||
option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
|
||||
option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
|
||||
option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
|
||||
|
||||
# ENABLE_TRAINING includes all training functionality
|
||||
# The following 2 entry points
|
||||
|
|
@ -971,6 +972,10 @@ if (onnxruntime_USE_LOCK_FREE_QUEUE)
|
|||
add_compile_definitions(USE_LOCK_FREE_QUEUE)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_FORCE_GENERIC_ALGORITHMS)
|
||||
add_compile_definitions(FORCE_GENERIC_ALGORITHMS)
|
||||
endif()
|
||||
|
||||
if (onnxruntime_ENABLE_LAZY_TENSOR)
|
||||
# To support LazyTensor, ORT needs to call Python function from C/C++.
|
||||
# so onnxruntime_ENABLE_PYTHON is required.
|
||||
|
|
|
|||
|
|
@ -679,6 +679,13 @@ endif()
|
|||
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
|
||||
file(GLOB_RECURSE mlas_platform_srcs
|
||||
"${MLAS_SRC_DIR}/scalar/*.cpp")
|
||||
elseif (onnxruntime_FORCE_GENERIC_ALGORITHMS)
|
||||
file(GLOB_RECURSE mlas_platform_srcs_generic
|
||||
"${MLAS_SRC_DIR}/scalar/*.cpp")
|
||||
set(mlas_platform_srcs
|
||||
${mlas_platform_srcs}
|
||||
${mlas_platform_srcs_generic}
|
||||
)
|
||||
endif()
|
||||
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -358,6 +358,22 @@ size_t
|
|||
bool ZeroMode
|
||||
);
|
||||
|
||||
#ifdef FORCE_GENERIC_ALGORITHMS
|
||||
typedef
|
||||
size_t
|
||||
(MLASCALL MLAS_GEMM_FLOAT_KERNEL_GENERIC)(
|
||||
const float* A,
|
||||
const float* B,
|
||||
float* C,
|
||||
size_t CountK,
|
||||
size_t CountM,
|
||||
size_t CountN,
|
||||
size_t lda,
|
||||
size_t ldc,
|
||||
float alpha
|
||||
);
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(__aarch64__) && defined(__linux__)
|
||||
|
|
@ -733,6 +749,10 @@ extern "C" {
|
|||
#if defined(MLAS_TARGET_AMD64_IX86)
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelSse;
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelAvx;
|
||||
#ifdef FORCE_GENERIC_ALGORITHMS
|
||||
MLAS_GEMM_FLOAT_KERNEL_GENERIC MlasSgemmKernelZero;
|
||||
MLAS_GEMM_FLOAT_KERNEL_GENERIC MlasSgemmKernelAdd;
|
||||
#endif
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelFma3;
|
||||
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelAvx512F;
|
||||
|
|
|
|||
|
|
@ -286,7 +286,11 @@ Return Value:
|
|||
this->QuantizeLinearS4Kernel = MlasQuantizeLinearS4Kernel;
|
||||
this->QuantizeLinearU4Kernel = MlasQuantizeLinearU4Kernel;
|
||||
#ifndef __APPLE__
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelSse;
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
this->CastF16ToF32Kernel = nullptr;
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
#endif // __APPLE__
|
||||
|
||||
this->NchwcBlockSize = 8;
|
||||
|
|
@ -308,8 +312,11 @@ Return Value:
|
|||
//
|
||||
// Check if the processor supports SSE 4.1 instructions.
|
||||
//
|
||||
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
if ((Cpuid1[2] & 0x80000) != 0) {
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
if (false) {
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
this->GemmU8S8Dispatch = &MlasGemmU8S8DispatchSse41;
|
||||
}
|
||||
|
||||
|
|
@ -319,7 +326,11 @@ Return Value:
|
|||
// Check if the processor supports the AVX and OSXSAVE features.
|
||||
//
|
||||
|
||||
#ifndef FORCE_GENERIC_ALGORITHMS
|
||||
if ((Cpuid1[2] & 0x18000000) == 0x18000000) {
|
||||
#else // FORCE_GENERIC_ALGORITHMS
|
||||
if (false) {
|
||||
#endif // FORCE_GENERIC_ALGORITHMS
|
||||
|
||||
//
|
||||
// Check if the operating system supports saving SSE and AVX states.
|
||||
|
|
|
|||
|
|
@ -867,6 +867,7 @@ MlasGemmQuantGetDispatch(
|
|||
{
|
||||
const MLAS_GEMM_QUANT_DISPATCH* GemmQuantDispatch = &MlasGemmQuantDispatchDefault;
|
||||
|
||||
#if !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
#if defined(MLAS_TARGET_AMD64_IX86)
|
||||
if (AIsSigned) {
|
||||
GemmQuantDispatch =
|
||||
|
|
@ -901,6 +902,7 @@ MlasGemmQuantGetDispatch(
|
|||
BIsSigned ? GetMlasPlatform().GemmU8S8Dispatch : GetMlasPlatform().GemmU8U8Dispatch;
|
||||
}
|
||||
#endif
|
||||
#endif // !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
if (nullptr == GemmQuantDispatch) {
|
||||
std::stringstream ss;
|
||||
|
|
|
|||
|
|
@ -1061,7 +1061,7 @@ Return Value:
|
|||
|
||||
size_t RowsHandled;
|
||||
|
||||
#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)
|
||||
#if (defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64)) && !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
RowsHandled = GetMlasPlatform().GemmFloatKernel(A, B, C, CountK, CountM, CountN, lda, ldc, alpha, ZeroMode);
|
||||
#else
|
||||
if (ZeroMode) {
|
||||
|
|
@ -1158,6 +1158,7 @@ Return Value:
|
|||
|
||||
if (M == 1 && TransA == CblasNoTrans && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
|
||||
|
||||
#if !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
|
||||
MLAS_SGEMM_KERNEL_M1_ROUTINE* SgemmKernelM1Routine;
|
||||
|
|
@ -1181,6 +1182,7 @@ Return Value:
|
|||
}
|
||||
|
||||
#endif
|
||||
#endif // !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -1193,7 +1195,7 @@ Return Value:
|
|||
|
||||
if (N == 1 && ldb == 1 && ldc == 1 && alpha == 1.0f && (beta == 0.0f || beta == 1.0f)) {
|
||||
|
||||
#if defined(MLAS_TARGET_AMD64)
|
||||
#if defined(MLAS_TARGET_AMD64) && !defined(FORCE_GENERIC_ALGORITHMS)
|
||||
|
||||
MLAS_SGEMM_KERNEL_M1_ROUTINE* SgemmKernelM1Routine;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue