[VSINPU] update crosscompiling patch (#22937)

### Description

Update this patch because the original files it modifies have changed.

### Motivation and Context
2026-07-17 18:40:28 +00:00 · 2024-11-27 06:35:16 +08:00 · 2024-11-27 06:35:16 +08:00 · 487184fa42
commit 487184fa42
parent 8826e39a81
1 changed files with 21 additions and 35 deletions
--- a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch
+++ b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch
@ -1,8 +1,8 @@
 diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
-index c02ac2096d..2bc51298f0 100644
+index 10c307b3b9..a52bf71c4d 100644
 --- a/cmake/onnxruntime_mlas.cmake
 +++ b/cmake/onnxruntime_mlas.cmake
-@@ -361,7 +361,7 @@ else()
+@@ -370,7 +370,7 @@ else()
         )
         set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
                                     PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
@ -12,10 +12,10 @@ index c02ac2096d..2bc51298f0 100644
             ${mlas_platform_srcs}
             ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
 diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
-index e46105324a..414c46a1ce 100644
+index 28ae64c4d5..0c77e0ca78 100644
 --- a/onnxruntime/core/mlas/inc/mlas.h
 +++ b/onnxruntime/core/mlas/inc/mlas.h
-@@ -82,6 +82,9 @@ Abstract:
+@@ -83,6 +83,9 @@ Abstract:

 #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930)
 #if defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC)
@ -25,7 +25,7 @@ index e46105324a..414c46a1ce 100644
 #if !defined(__APPLE__)
 // Had to temporary disable fp16 under APPLE ARM64, as compiling
 // the source files require a hardware specific compilation flag.
-@@ -90,6 +93,7 @@ Abstract:
+@@ -91,6 +94,7 @@ Abstract:

 #define MLAS_F16VEC_INTRINSICS_SUPPORTED

@ -33,7 +33,7 @@ index e46105324a..414c46a1ce 100644
 #endif //
 #endif // ARM64
 #endif // Visual Studio 16 or earlier does not support fp16 intrinsic
-@@ -1635,6 +1639,7 @@ MlasHalfGemmConvertPackB(
+@@ -1644,6 +1648,7 @@ MlasHalfGemmConvertPackB(
     );

 #if defined(__aarch64__) && defined(__linux__)
@ -41,7 +41,7 @@ index e46105324a..414c46a1ce 100644
 /**
  * @brief Whether current CPU supports Bfloat16(bf16) acceleration.
  */
-@@ -1746,6 +1751,7 @@ MlasSBGemmPackBSize(size_t N, size_t K);
+@@ -1755,6 +1760,7 @@ MlasSBGemmPackBSize(size_t N, size_t K);
 void MLASCALL
 MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB);
 #endif
@ -50,10 +50,10 @@ index e46105324a..414c46a1ce 100644
 /**
  * @brief Indirect Depthwise convolution for fp16
 diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
-index 4239e2ecae..3df7e5573d 100644
+index 0533a5e49b..c18bf7f90d 100644
 --- a/onnxruntime/core/mlas/lib/mlasi.h
 +++ b/onnxruntime/core/mlas/lib/mlasi.h
-@@ -361,6 +361,7 @@ size_t
+@@ -377,6 +377,7 @@ size_t
 #else

 #if defined(__aarch64__) && defined(__linux__)
@ -61,7 +61,7 @@ index 4239e2ecae..3df7e5573d 100644
 typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)(
     const float* A,
     const bfloat16_t* B,
-@@ -373,6 +374,7 @@ typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)(
+@@ -389,6 +390,7 @@ typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)(
     const float* Bias
 );
 #endif
@ -69,7 +69,7 @@ index 4239e2ecae..3df7e5573d 100644

 typedef
 size_t
-@@ -763,8 +765,10 @@ extern "C" {
+@@ -796,8 +798,10 @@ extern "C" {
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero;
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd;
 #if defined(__aarch64__) && defined(__linux__)
@ -80,7 +80,7 @@ index 4239e2ecae..3df7e5573d 100644
 #endif
     MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelZero;
     MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd;
-@@ -899,8 +903,10 @@ extern "C" {
+@@ -946,8 +950,10 @@ extern "C" {
 #define MLAS_QGEMM_THREAD_COMPLEXITY                65536

 #if defined(__aarch64__) && defined(__linux__)
@ -91,26 +91,12 @@ index 4239e2ecae..3df7e5573d 100644

 //
 // Single-threaded single precision matrix/matrix multiply operation.
-@@ -2570,4 +2576,3 @@ MlasPackInt4Elements(uint8_t* Output, UnpackedType ValueLow, UnpackedType ValueH
-     static_assert(std::is_same_v<UnpackedType, uint8_t> || std::is_same_v<UnpackedType, int8_t>);
-     *Output = static_cast<uint8_t>(((ValueHigh & 0xF) << 4) | (ValueLow & 0xF));
- }
-
 diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
-index ed437f20f7..8c9d0a75fd 100644
+index b3c9461293..424c3b0441 100644
 --- a/onnxruntime/core/mlas/lib/platform.cpp
 +++ b/onnxruntime/core/mlas/lib/platform.cpp
-@@ -20,7 +20,7 @@ Abstract:
- #include <thread>
- #include <mutex>
-
-#if defined(MLAS_TARGET_POWER)
-+#if defined(MLAS_TARGET_POWER)
- #if defined(__linux__)
- #include <sys/auxv.h>
- #elif defined(_AIX)
-@@ -536,7 +536,7 @@ Return Value:
-         this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;
+@@ -574,7 +574,7 @@ Return Value:
+         this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;
     }

 -#if defined(__linux__)
@ -137,10 +123,10 @@ index de7fd72fad..4f75dbd6fa 100644
 +#endif
 #endif  // defined(__aarch64__) && defined(__linux__)
 diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
-index 6a71283f9d..d8bd348854 100644
+index 2c6d23e4de..61aaacdfd6 100644
 --- a/onnxruntime/core/providers/cpu/math/matmul.cc
 +++ b/onnxruntime/core/providers/cpu/math/matmul.cc
-@@ -132,7 +132,7 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
+@@ -133,7 +133,7 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {

   return Status::OK();
 }
@ -149,7 +135,7 @@ index 6a71283f9d..d8bd348854 100644
 bool GemmPackBBfloat16(AllocatorPtr& alloc,
                        const Tensor& tensor_b,
                        bool trans_b,
-@@ -180,6 +180,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
+@@ -181,6 +181,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
   if (input_idx == 1) {
     size_t packed_b_size;
 #if defined(__aarch64__) && defined(__linux__)
@ -157,7 +143,7 @@ index 6a71283f9d..d8bd348854 100644
     size_t dim1 = 0;
     size_t dim2 = 0;
     TensorShape b_shape = tensor.Shape();
-@@ -192,6 +193,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
+@@ -193,6 +194,7 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
     if (use_fastmath_mode_ && (trans_b_attr_ == 0) && ((dim1 * dim2) >= kFastMathModeKernelsizeThreshold)) {
       is_packed = GemmPackBBfloat16(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_);
     } else
@ -165,7 +151,7 @@ index 6a71283f9d..d8bd348854 100644
 #endif
     {
       is_packed = GemmPackBFp32(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_);
-@@ -257,6 +259,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
+@@ -259,6 +261,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
   const size_t lda = helper.Lda(trans_a);
   const size_t ldb = helper.Ldb(trans_b);
 #if defined(__aarch64__) && defined(__linux__)
@ -173,7 +159,7 @@ index 6a71283f9d..d8bd348854 100644
   if (use_fastmath_mode_ && !trans_b && ((N * K) >= kFastMathModeKernelsizeThreshold)) {
     std::vector<MLAS_SBGEMM_DATA_PARAMS> data(max_len);
     for (size_t i = 0; i < max_len; i++) {
-@@ -273,6 +276,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
+@@ -275,6 +278,7 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
     }
     MlasSBGemmBatch(M, N, K, max_len, data.data(), thread_pool);
   } else