mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
### Description

Using Intel AMX int8 instructions to accelerate quantized GEMM

### Motivation and Context

AMX instructions accelerate quantized GEMM significantly:

Prepacked B perf numbers (latency in ns)

| GEMM Config | AVX512Vnni | AMX |
| -- | --: | --: |
| M:384/N:1024/K:1024/Batch:1/Threads:4 | 1057511 | 285393 |
| M:384/N:1024/K:3072/Batch:1/Threads:4 | 2643929 | 700397 |
| M:384/N:1024/K:4096/Batch:1/Threads:4 | 3784750 | 890701 |
| M:384/N:4096/K:1024/Batch:1/Threads:4 | 2378139 | 887251 |
| M:384/N:1024/K:1024/Batch:1/Threads:16 | 307137 | 138481 |
| M:384/N:1024/K:3072/Batch:1/Threads:16 | 855730 | 295027 |
| M:384/N:1024/K:4096/Batch:1/Threads:16 | 1126878 | 317395 |
| M:384/N:4096/K:1024/Batch:1/Threads:16 | 781963 | 237014 |
| M:1536/N:1024/K:1024/Batch:1/Threads:16 | 538864 | 181459 |
| M:1536/N:1024/K:3072/Batch:1/Threads:16 | 1681002 | 561600 |
| M:1536/N:1024/K:4096/Batch:1/Threads:16 | 2158127 | 717470 |
| M:1536/N:4096/K:1024/Batch:1/Threads:16 | 2428622 | 896140 |
| M:3072/N:1024/K:1024/Batch:1/Threads:16 | 1058029 | 357031 |
| M:3072/N:1024/K:3072/Batch:1/Threads:16 | 3138504 | 1095857 |
| M:3072/N:1024/K:4096/Batch:1/Threads:16 | 4155640 | 1386183 |
| M:3072/N:4096/K:1024/Batch:1/Threads:16 | 4679030 | 1778624 |

Co-authored-by: Yi-Hong Lyu <yilyu@microsoft.com>
Co-authored-by: Chen Fu <fuchen@microsoft.com>

||
|---|---|---|
| .. | ||
| external | ||
| patches | ||
| tensorboard | ||
| adjust_global_compile_flags.cmake | ||
| CMakeLists.txt | ||
| CMakeSettings.json | ||
| codeconv.runsettings | ||
| deps.txt | ||
| EnableVisualStudioCodeAnalysis.props | ||
| gdk_toolchain.cmake | ||
| Info.plist.in | ||
| libonnxruntime.pc.cmake.in | ||
| nuget_helpers.cmake | ||
| onnxruntime.cmake | ||
| onnxruntime_codegen_tvm.cmake | ||
| onnxruntime_common.cmake | ||
| onnxruntime_config.h.in | ||
| onnxruntime_csharp.cmake | ||
| onnxruntime_eager.cmake | ||
| onnxruntime_flatbuffers.cmake | ||
| onnxruntime_framework.cmake | ||
| onnxruntime_fuzz_test.cmake | ||
| onnxruntime_graph.cmake | ||
| onnxruntime_ios.toolchain.cmake | ||
| onnxruntime_java.cmake | ||
| onnxruntime_java_unittests.cmake | ||
| onnxruntime_kernel_explorer.cmake | ||
| onnxruntime_language_interop_ops.cmake | ||
| onnxruntime_mlas.cmake | ||
| onnxruntime_nodejs.cmake | ||
| onnxruntime_objectivec.cmake | ||
| onnxruntime_opschema_lib.cmake | ||
| onnxruntime_optimizer.cmake | ||
| onnxruntime_providers.cmake | ||
| onnxruntime_pyop.cmake | ||
| onnxruntime_python.cmake | ||
| onnxruntime_rocm_hipify.cmake | ||
| onnxruntime_session.cmake | ||
| onnxruntime_snpe_provider.cmake | ||
| onnxruntime_training.cmake | ||
| onnxruntime_unittests.cmake | ||
| onnxruntime_util.cmake | ||
| onnxruntime_webassembly.cmake | ||
| precompiled_header.cmake | ||
| Sdl.ruleset | ||
| set_winapi_family_desktop.h | ||
| target_delayload.cmake | ||
| uwp_stubs.h | ||
| wcos_rules_override.cmake | ||
| winml.cmake | ||
| winml_cppwinrt.cmake | ||
| winml_sdk_helpers.cmake | ||
| winml_unittests.cmake | ||