onnxruntime

mirror of https://github.com/saymrwulf/onnxruntime.git synced 2026-07-03 03:58:54 +00:00

History

snadampal 77da2ef278 [aarch64] Add Sbgemm kernel to accelerate fp32 tensor matmul with bfloat16 (#17031 ) ### Description This PR adds SbgemmKernel for aarch64. This includes Sbegmm kernel to implement matrix multiplication with bfloat16 SIMD instructions (bfmmla) and MatMul operator changes to invoke the Sbgemm kernel. To enable Sbgemm kernel, set the following session option: "kOrtSessionOptionsGemmFastMathMode" The PR also adds new test cases for mlas and ort. ### Motivation and Context This is to improve MatMul performance on aarch64 platform. I have run the below benchmarking script (bert , roberta and gpt2 model inference) on AWS Graviton3 based c7g.4xl instance and observed 1.2x -1.76x performance improvement compared to sgemm (fp32) kernel performance. ``` cd onnxruntime/python/tools/transformers python3 benchmark.py ``` And the unit test precision results are matching to sgemm kernel results. `./build.sh --config RelWithDebInfo --build_shared_lib --parallel --compile_no_warning_as_error --skip_submodule_sync `		2024-01-22 14:43:06 -08:00
..
external	Download protoc for all Apple host builds, remove protoc build from iOS packaging pipeline. (#19209 )	2024-01-19 15:30:09 -08:00
patches	Update absl and gtest to fix an ARM64EC build error (#18735 )	2023-12-07 15:55:17 -08:00
tensorboard
adjust_global_compile_flags.cmake	[WebNN EP] Fixed build issue with disable_rtti (#19173 )	2024-01-16 21:35:13 -08:00
arm64x.cmake	Build onnxruntime.dll as arm64x (#18633 )	2023-12-06 16:49:00 -08:00
CMakeLists.txt	Update x64 template kernel library for 'sqnbitgemm' (#19016 )	2024-01-18 13:16:34 -08:00
CMakeSettings.json
codeconv.runsettings	CMake changes (#2961 )	2020-02-03 19:33:14 -08:00
deps.txt	Update x64 template kernel library for 'sqnbitgemm' (#19016 )	2024-01-18 13:16:34 -08:00
deps_update_and_upload.py	[Linter] Bump ruff and remove pylint (#17797 )	2023-10-05 21:07:33 -07:00
EnableVisualStudioCodeAnalysis.props
gdk_toolchain.cmake
Info.plist.in
libonnxruntime.pc.cmake.in
linux_arm32_crosscompile_toolchain.cmake	Add a build validation for Linux ARM64 cross-compile (#18200 )	2023-11-08 13:03:18 -08:00
linux_arm64_crosscompile_toolchain.cmake	Add a build validation for Linux ARM64 cross-compile (#18200 )	2023-11-08 13:03:18 -08:00
nuget_helpers.cmake
onnxruntime.cmake	Add MacOS build to ORT C Pod (#18550 )	2023-11-28 10:11:53 -08:00
onnxruntime_codegen_tvm.cmake
onnxruntime_common.cmake	Update C/C++ dependencies: abseil, date, nsync, googletest, wil, mp11, cpuinfo and safeint (#15470 )	2023-09-08 13:35:04 -07:00
onnxruntime_compile_triton_kernel.cmake
onnxruntime_config.h.in	Enabling c++ 20 in MacOS build (#16187 )	2023-09-26 11:27:02 -07:00
onnxruntime_csharp.cmake
onnxruntime_flatbuffers.cmake
onnxruntime_framework.cmake
onnxruntime_framework.natvis
onnxruntime_fuzz_test.cmake
onnxruntime_graph.cmake	Pre-link when creating static library for apple framework (#18241 )	2023-11-03 23:38:29 +10:00
onnxruntime_ios.toolchain.cmake
onnxruntime_java.cmake
onnxruntime_java_unittests.cmake
onnxruntime_kernel_explorer.cmake
onnxruntime_language_interop_ops.cmake
onnxruntime_mlas.cmake	[aarch64] Add Sbgemm kernel to accelerate fp32 tensor matmul with bfloat16 (#17031 )	2024-01-22 14:43:06 -08:00
onnxruntime_nodejs.cmake	Added DML and CUDA provider support in onnxruntime-node (#16050 )	2023-08-25 16:57:06 -07:00
onnxruntime_objectivec.cmake
onnxruntime_opschema_lib.cmake
onnxruntime_optimizer.cmake	[ROCm] Fix hipify error: fast_divmod.h: No such file or directory (#19060 )	2024-01-10 14:49:19 +08:00
onnxruntime_providers.cmake	Add API for NPU Device Selection in the DML EP (#17612 )	2023-10-11 14:53:00 -07:00
onnxruntime_providers_acl.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_armnn.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_azure.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_cann.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_coreml.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_cpu.cmake	Update x64 template kernel library for 'sqnbitgemm' (#19016 )	2024-01-18 13:16:34 -08:00
onnxruntime_providers_cuda.cmake	[TensorRT EP] Enable a minimal CUDA EP compilation without kernels (#19052 )	2024-01-17 11:33:34 -08:00
onnxruntime_providers_dml.cmake	Delay load dxcore.dll in addition to ext-ms-win-dxcore-l1-1-0.dll (#18913 )	2023-12-26 12:33:42 -08:00
onnxruntime_providers_dnnl.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_js.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_migraphx.cmake	CUDA EP vs ROCM EP hipify audit (#17776 )	2023-10-13 10:13:53 +08:00
onnxruntime_providers_nnapi.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_openvino.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_qnn.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_rknpu.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_rocm.cmake	CUDA EP vs ROCM EP hipify audit (#17776 )	2023-10-13 10:13:53 +08:00
onnxruntime_providers_tensorrt.cmake	[TensorRT EP] Properly set CUDA_INCLUDE_DIR for onnx-tensorrt (#18274 )	2023-11-03 20:04:10 -07:00
onnxruntime_providers_tvm.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_vitisai.cmake	[VitisAI] 1. api compatbile 2. dynamic load onnx (#18470 )	2023-12-14 14:43:41 -08:00
onnxruntime_providers_webnn.cmake	Split onnxruntime_providers.cmake to multiple (#17853 )	2023-10-09 20:33:44 -07:00
onnxruntime_providers_xnnpack.cmake	Update XNNPACK to latest version (#18038 )	2023-11-03 09:04:28 -07:00
onnxruntime_pyop.cmake
onnxruntime_python.cmake	Remove DORT since it's in PyTorch main now (#18996 )	2024-01-04 12:59:47 -08:00
onnxruntime_rocm_hipify.cmake	MoE with Expert Slicing (#18565 )	2023-12-05 16:56:38 -08:00
onnxruntime_session.cmake
onnxruntime_snpe_provider.cmake
onnxruntime_training.cmake
onnxruntime_unittests.cmake	update to emsdk-3.1.51 (#18844 )	2024-01-12 16:04:33 -08:00
onnxruntime_util.cmake
onnxruntime_webassembly.cmake	[WebNN EP] Fixed build issue with disable_rtti (#19173 )	2024-01-16 21:35:13 -08:00
precompiled_header.cmake
Sdl.ruleset
set_winapi_family_desktop.h
target_delayload.cmake
uwp_stubs.h
wcos_rules_override.cmake
winml.cmake	Update winml to use #cores - #soc cores by Default as the number of intraopthreads (#18384 )	2023-11-28 09:26:48 -08:00
winml_cppwinrt.cmake
winml_sdk_helpers.cmake
winml_unittests.cmake	Update C/C++ dependencies: abseil, date, nsync, googletest, wil, mp11, cpuinfo and safeint (#15470 )	2023-09-08 13:35:04 -07:00