diff --git a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch index f55b593fd8..95a4e4650e 100644 --- a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch +++ b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch @@ -1,8 +1,8 @@ diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake -index c02ac2096d..2bc51298f0 100644 +index 10c307b3b9..a52bf71c4d 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake -@@ -361,7 +361,7 @@ else() +@@ -370,7 +370,7 @@ else() ) set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod") @@ -12,10 +12,10 @@ index c02ac2096d..2bc51298f0 100644 ${mlas_platform_srcs} ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h -index e46105324a..414c46a1ce 100644 +index 28ae64c4d5..0c77e0ca78 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h -@@ -82,6 +82,9 @@ Abstract: +@@ -83,6 +83,9 @@ Abstract: #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) #if defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC) @@ -25,7 +25,7 @@ index e46105324a..414c46a1ce 100644 #if !defined(__APPLE__) // Had to temporary disable fp16 under APPLE ARM64, as compiling // the source files require a hardware specific compilation flag. -@@ -90,6 +93,7 @@ Abstract: +@@ -91,6 +94,7 @@ Abstract: #define MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -33,7 +33,7 @@ index e46105324a..414c46a1ce 100644 #endif // #endif // ARM64 #endif // Visual Studio 16 or earlier does not support fp16 intrinsic -@@ -1635,6 +1639,7 @@ MlasHalfGemmConvertPackB( +@@ -1644,6 +1648,7 @@ MlasHalfGemmConvertPackB( ); #if defined(__aarch64__) && defined(__linux__) @@ -41,7 +41,7 @@ index e46105324a..414c46a1ce 100644 /** * @brief Whether current CPU supports Bfloat16(bf16) acceleration. */ -@@ -1746,6 +1751,7 @@ MlasSBGemmPackBSize(size_t N, size_t K); +@@ -1755,6 +1760,7 @@ MlasSBGemmPackBSize(size_t N, size_t K); void MLASCALL MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB); #endif @@ -50,10 +50,10 @@ index e46105324a..414c46a1ce 100644 /** * @brief Indirect Depthwise convolution for fp16 diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h -index 4239e2ecae..3df7e5573d 100644 +index 0533a5e49b..c18bf7f90d 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h -@@ -361,6 +361,7 @@ size_t +@@ -377,6 +377,7 @@ size_t #else #if defined(__aarch64__) && defined(__linux__) @@ -61,7 +61,7 @@ index 4239e2ecae..3df7e5573d 100644 typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)( const float* A, const bfloat16_t* B, -@@ -373,6 +374,7 @@ typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)( +@@ -389,6 +390,7 @@ typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)( const float* Bias ); #endif @@ -69,7 +69,7 @@ index 4239e2ecae..3df7e5573d 100644 typedef size_t -@@ -763,8 +765,10 @@ extern "C" { +@@ -796,8 +798,10 @@ extern "C" { MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero; MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd; #if defined(__aarch64__) && defined(__linux__) @@ -80,7 +80,7 @@ index 4239e2ecae..3df7e5573d 100644 #endif MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelZero; MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd; -@@ -899,8 +903,10 @@ extern "C" { +@@ -946,8 +950,10 @@ extern "C" { #define MLAS_QGEMM_THREAD_COMPLEXITY 65536 #if defined(__aarch64__) && defined(__linux__) @@ -91,26 +91,12 @@ index 4239e2ecae..3df7e5573d 100644 // // Single-threaded single precision matrix/matrix multiply operation. -@@ -2570,4 +2576,3 @@ MlasPackInt4Elements(uint8_t* Output, UnpackedType ValueLow, UnpackedType ValueH - static_assert(std::is_same_v || std::is_same_v); - *Output = static_cast(((ValueHigh & 0xF) << 4) | (ValueLow & 0xF)); - } -- diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp -index ed437f20f7..8c9d0a75fd 100644 +index b3c9461293..424c3b0441 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp -@@ -20,7 +20,7 @@ Abstract: - #include - #include - --#if defined(MLAS_TARGET_POWER) -+#if defined(MLAS_TARGET_POWER) - #if defined(__linux__) - #include - #elif defined(_AIX) -@@ -536,7 +536,7 @@ Return Value: - this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; +@@ -574,7 +574,7 @@ Return Value: + this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot; } -#if defined(__linux__) @@ -137,10 +123,10 @@ index de7fd72fad..4f75dbd6fa 100644 +#endif #endif // defined(__aarch64__) && defined(__linux__) diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc -index 6a71283f9d..d8bd348854 100644 +index 2c6d23e4de..61aaacdfd6 100644 --- a/onnxruntime/core/providers/cpu/math/matmul.cc +++ b/onnxruntime/core/providers/cpu/math/matmul.cc -@@ -132,7 +132,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { +@@ -133,7 +133,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { return Status::OK(); } @@ -149,7 +135,7 @@ index 6a71283f9d..d8bd348854 100644 bool GemmPackBBfloat16(AllocatorPtr& alloc, const Tensor& tensor_b, bool trans_b, -@@ -180,6 +180,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc +@@ -181,6 +181,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc if (input_idx == 1) { size_t packed_b_size; #if defined(__aarch64__) && defined(__linux__) @@ -157,7 +143,7 @@ index 6a71283f9d..d8bd348854 100644 size_t dim1 = 0; size_t dim2 = 0; TensorShape b_shape = tensor.Shape(); -@@ -192,6 +193,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc +@@ -193,6 +194,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc if (use_fastmath_mode_ && (trans_b_attr_ == 0) && ((dim1 * dim2) >= kFastMathModeKernelsizeThreshold)) { is_packed = GemmPackBBfloat16(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_); } else @@ -165,7 +151,7 @@ index 6a71283f9d..d8bd348854 100644 #endif { is_packed = GemmPackBFp32(alloc, tensor, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_); -@@ -257,6 +259,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { +@@ -259,6 +261,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { const size_t lda = helper.Lda(trans_a); const size_t ldb = helper.Ldb(trans_b); #if defined(__aarch64__) && defined(__linux__) @@ -173,7 +159,7 @@ index 6a71283f9d..d8bd348854 100644 if (use_fastmath_mode_ && !trans_b && ((N * K) >= kFastMathModeKernelsizeThreshold)) { std::vector data(max_len); for (size_t i = 0; i < max_len; i++) { -@@ -273,6 +276,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { +@@ -275,6 +278,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { } MlasSBGemmBatch(M, N, K, max_len, data.data(), thread_pool); } else