mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
### Description <!-- Describe your changes. --> This commit introduces a new vectorized AVX512F kernel, MlasReduceMaximumF32KernelAvx512F, which efficiently computes the maximum value of the supplied buffer. Additionally, microbenchmarks have been added for MlasComputeSoftmax (inplace), MlasReduceMaximumF32KernelAvx, MlasComputeSumExpF32KernelAvx512F, and MlasComputeSoftmaxOutputF32KernelAvx. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> The goal of this commit is to enhance the performance of ReduceMaximumF32Kernel on CPUs with AVX512F instruction support. | AVX | | | AVX512 | | | -- | -- | -- | -- | -- | -- | -- | -- name | iterations | real_time | cpu_time | iterations | real_time | cpu_time | time_unit REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:3/real_time | 271277304 | 2.58095 | 2.58091 | 263338132 | 2.65661 | 2.65661 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:3/real_time | 271220477 | 2.58095 | 2.58095 | 263509929 | 2.65652 | 2.65649 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:3/real_time | 271240587 | 2.58064 | 2.58064 | 263479542 | 2.65671 | 2.65665 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:3/real_time | 271227745 | 2.58083 | 2.58079 | 263402506 | 2.65657 | 2.65657 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:3/real_time | 271255069 | 2.58073 | 2.58071 | 263463858 | 2.65682 | 2.65682 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:3/real_time | 271257174 | 2.58058 | 2.58052 | 263460120 | 2.65682 | 2.65682 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:4/real_time | 174395051 | 4.01401 | 4.01401 | 197330481 | 3.5465 | 3.54636 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:4/real_time | 174645502 | 3.99691 | 3.99691 | 197474831 | 3.54298 | 3.54278 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:4/real_time | 174523308 | 4.01391 | 4.01386 | 197389981 | 3.54518 | 3.54506 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:4/real_time | 174779200 | 
3.99874 | 3.99874 | 197519075 | 3.54227 | 3.54209 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:4/real_time | 174642874 | 4.00645 | 4.00641 | 197642101 | 3.54195 | 3.54188 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:4/real_time | 174546754 | 4.0061 | 4.00608 | 197621033 | 3.54296 | 3.54281 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:5/real_time | 162752651 | 4.30119 | 4.30114 | 215552503 | 3.24767 | 3.24752 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:5/real_time | 162717463 | 4.30123 | 4.30116 | 215541082 | 3.24711 | 3.24695 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:5/real_time | 162718819 | 4.3016 | 4.30153 | 215589239 | 3.24725 | 3.24708 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:5/real_time | 162719596 | 4.30151 | 4.30145 | 215563846 | 3.24956 | 3.24949 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:5/real_time | 162753333 | 4.30125 | 4.30125 | 215537315 | 3.24924 | 3.24908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:5/real_time | 162752258 | 4.3014 | 4.30141 | 215526482 | 3.24744 | 3.24735 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:7/real_time | 143579660 | 4.87526 | 4.87516 | 100000000 | 5.25767 | 5.25752 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:7/real_time | 143585097 | 4.87476 | 4.87467 | 100000000 | 5.41583 | 5.41567 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:7/real_time | 143571011 | 4.87506 | 4.87503 | 182359467 | 3.83773 | 3.83764 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:7/real_time | 143587142 | 4.87487 | 4.8748 | 182397261 | 3.83807 | 3.8379 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:7/real_time | 143578465 | 4.87525 | 4.87521 | 182428602 | 3.83777 | 3.83768 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:7/real_time | 143588555 | 4.87491 | 4.87488 | 125280452 | 5.59791 | 5.59766 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:9/real_time | 284851058 | 2.43476 | 2.43476 | 156879863 | 4.42895 | 4.42884 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:9/real_time | 270700898 | 2.59031 | 2.59024 | 157953114 | 4.42995 
| 4.42968 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:9/real_time | 282871172 | 2.45385 | 2.45385 | 157801156 | 4.42817 | 4.42804 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:9/real_time | 285307738 | 2.47009 | 2.47005 | 158058507 | 4.4279 | 4.42786 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:9/real_time | 285709536 | 2.45481 | 2.45476 | 158070961 | 4.42809 | 4.42799 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:9/real_time | 285449733 | 2.47495 | 2.47491 | 158069718 | 4.45026 | 4.45017 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:11/real_time | 189213618 | 3.79684 | 3.79676 | 139459497 | 5.01882 | 5.01871 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:11/real_time | 185600468 | 3.76394 | 3.76376 | 139444892 | 5.01922 | 5.01905 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:11/real_time | 184968668 | 3.80636 | 3.80636 | 139470834 | 5.01948 | 5.01936 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:11/real_time | 183867226 | 3.80432 | 3.80427 | 139481986 | 5.01975 | 5.01944 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:11/real_time | 184301650 | 3.81634 | 3.81634 | 139452846 | 5.01983 | 5.01972 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:11/real_time | 186215795 | 3.82659 | 3.82654 | 139497736 | 5.02119 | 5.02113 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:13/real_time | 135622415 | 5.16256 | 5.16252 | 124661337 | 5.61227 | 5.61194 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:13/real_time | 135618907 | 5.15967 | 5.1596 | 124805224 | 5.6088 | 5.60854 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:13/real_time | 135612192 | 5.15506 | 5.15501 | 124803221 | 5.60901 | 5.60869 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:13/real_time | 135906082 | 5.15818 | 5.15818 | 124776601 | 5.60898 | 5.60886 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:13/real_time | 135369523 | 5.15709 | 5.15682 | 124790370 | 5.60927 | 5.60902 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:13/real_time | 135596827 | 5.1603 | 5.1603 | 124792145 | 5.61637 | 5.61614 | ns 
REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:15/real_time | 110947137 | 5.96511 | 5.96495 | 112861522 | 6.20035 | 6.20014 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:15/real_time | 118004792 | 6.22645 | 6.22628 | 112909900 | 6.20073 | 6.20073 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:15/real_time | 112630319 | 6.25564 | 6.25552 | 112874563 | 6.19932 | 6.19924 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:15/real_time | 117403034 | 6.17263 | 6.17258 | 112927318 | 6.19866 | 6.19842 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:15/real_time | 108921863 | 6.48624 | 6.48612 | 112927746 | 6.20057 | 6.20026 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:15/real_time | 110358148 | 6.66805 | 6.66789 | 112907312 | 6.19938 | 6.19908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:16/real_time | 203419574 | 3.4415 | 3.44137 | 237134525 | 2.95649 | 2.95638 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:16/real_time | 203414035 | 3.4411 | 3.44099 | 237129564 | 2.95178 | 2.95171 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:16/real_time | 203404068 | 3.44157 | 3.44151 | 236981704 | 2.9518 | 2.95167 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:16/real_time | 203391471 | 3.44146 | 3.44137 | 237108807 | 2.95203 | 2.95196 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:16/real_time | 203393801 | 3.44131 | 3.44127 | 237126460 | 2.95278 | 2.95272 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:16/real_time | 203407476 | 3.44181 | 3.44162 | 237154444 | 2.95293 | 2.9528 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:500/real_time | 37551439 | 18.6407 | 18.6407 | 39222534 | 17.858 | 17.8571 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:500/real_time | 37544097 | 18.6404 | 18.6401 | 39174151 | 17.8539 | 17.8536 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:500/real_time | 37549837 | 18.6391 | 18.6391 | 39233956 | 17.8507 | 17.8505 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:500/real_time | 45996345 | 15.2157 | 15.2153 | 39285929 | 17.848 | 17.8474 | ns 
REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:500/real_time | 46012429 | 15.2184 | 15.2179 | 65664865 | 10.7366 | 10.7364 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:500/real_time | 45912375 | 15.2349 | 15.2346 | 65205908 | 10.8498 | 10.8492 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:2000/real_time | 9493955 | 73.7232 | 73.7203 | 10188090 | 68.7931 | 68.7908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:2000/real_time | 9495562 | 73.7173 | 73.7173 | 10180895 | 68.7533 | 68.7511 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:2000/real_time | 9487371 | 73.7852 | 73.7831 | 10164473 | 68.7279 | 68.725 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:2000/real_time | 10816047 | 64.7322 | 64.7287 | 10168481 | 68.8109 | 68.8096 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:2000/real_time | 10808802 | 64.7232 | 64.721 | 19478320 | 36.1471 | 36.1461 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:2000/real_time | 10818192 | 64.7304 | 64.728 | 19419672 | 35.9635 | 35.9635 | ns |
| | | |
|---|---|---|
| .. | ||
| external | ||
| patches | ||
| tensorboard | ||
| adjust_global_compile_flags.cmake | ||
| arm64x.cmake | ||
| CMakeLists.txt | ||
| CMakeSettings.json | ||
| codeconv.runsettings | ||
| deps.txt | ||
| deps_update_and_upload.py | ||
| EnableVisualStudioCodeAnalysis.props | ||
| gdk_toolchain.cmake | ||
| Info.plist.in | ||
| libonnxruntime.pc.cmake.in | ||
| linux_arm32_crosscompile_toolchain.cmake | ||
| linux_arm64_crosscompile_toolchain.cmake | ||
| maccatalyst_prepare_objects_for_prelink.py | ||
| nuget_helpers.cmake | ||
| onnxruntime.cmake | ||
| onnxruntime_codegen_tvm.cmake | ||
| onnxruntime_common.cmake | ||
| onnxruntime_compile_triton_kernel.cmake | ||
| onnxruntime_config.h.in | ||
| onnxruntime_csharp.cmake | ||
| onnxruntime_flatbuffers.cmake | ||
| onnxruntime_framework.cmake | ||
| onnxruntime_framework.natvis | ||
| onnxruntime_fuzz_test.cmake | ||
| onnxruntime_graph.cmake | ||
| onnxruntime_ios.toolchain.cmake | ||
| onnxruntime_java.cmake | ||
| onnxruntime_java_unittests.cmake | ||
| onnxruntime_kernel_explorer.cmake | ||
| onnxruntime_language_interop_ops.cmake | ||
| onnxruntime_mlas.cmake | ||
| onnxruntime_nodejs.cmake | ||
| onnxruntime_objectivec.cmake | ||
| onnxruntime_opschema_lib.cmake | ||
| onnxruntime_optimizer.cmake | ||
| onnxruntime_providers.cmake | ||
| onnxruntime_providers_acl.cmake | ||
| onnxruntime_providers_armnn.cmake | ||
| onnxruntime_providers_azure.cmake | ||
| onnxruntime_providers_cann.cmake | ||
| onnxruntime_providers_coreml.cmake | ||
| onnxruntime_providers_cpu.cmake | ||
| onnxruntime_providers_cuda.cmake | ||
| onnxruntime_providers_dml.cmake | ||
| onnxruntime_providers_dnnl.cmake | ||
| onnxruntime_providers_js.cmake | ||
| onnxruntime_providers_migraphx.cmake | ||
| onnxruntime_providers_nnapi.cmake | ||
| onnxruntime_providers_openvino.cmake | ||
| onnxruntime_providers_qnn.cmake | ||
| onnxruntime_providers_rknpu.cmake | ||
| onnxruntime_providers_rocm.cmake | ||
| onnxruntime_providers_tensorrt.cmake | ||
| onnxruntime_providers_tvm.cmake | ||
| onnxruntime_providers_vitisai.cmake | ||
| onnxruntime_providers_webnn.cmake | ||
| onnxruntime_providers_xnnpack.cmake | ||
| onnxruntime_pyop.cmake | ||
| onnxruntime_python.cmake | ||
| onnxruntime_rocm_hipify.cmake | ||
| onnxruntime_session.cmake | ||
| onnxruntime_snpe_provider.cmake | ||
| onnxruntime_training.cmake | ||
| onnxruntime_unittests.cmake | ||
| onnxruntime_util.cmake | ||
| onnxruntime_webassembly.cmake | ||
| precompiled_header.cmake | ||
| riscv64.toolchain.cmake | ||
| Sdl.ruleset | ||
| set_winapi_family_desktop.h | ||
| target_delayload.cmake | ||
| uwp_stubs.h | ||
| wcos_rules_override.cmake | ||
| winml.cmake | ||
| winml_cppwinrt.cmake | ||
| winml_sdk_helpers.cmake | ||
| winml_unittests.cmake | ||