add arm64 bfloat16 fastmath mode option for transformers benchmarking script (#19294)

Add an arm64 bfloat16 fastmath mode option to the transformers benchmarking scripts.

### Motivation and Context

ONNX Runtime now supports bfloat16 fastmath GEMM kernels on arm64 platforms with bfloat16 instruction support. This PR updates the benchmark scripts so that mode can be tested.
2026-07-16 18:31:27 +00:00 · 2024-02-12 17:20:36 -06:00 · 2024-02-12 17:20:36 -06:00 · 7fa6f4fca4
commit 7fa6f4fca4
parent 90e2e8561f
3 changed files with 25 additions and 1 deletions
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@ -36,6 +36,8 @@
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
+        Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
+            python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

    It is recommended to use run_benchmark.sh to launch benchmark.
 """
@ -106,6 +108,7 @@ def run_onnxruntime(
    use_raw_attention_mask,
    model_fusion_statistics,
    model_source,
+    enable_arm64_bfloat16_fastmath_mlas_gemm,
    args,
 ):
    import onnxruntime
@ -209,6 +212,7 @@ def run_onnxruntime(
                enable_all_optimization=True,
                num_threads=num_threads,
                verbose=verbose,
+                enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
            )
            if ort_session is None:
                continue
@ -760,6 +764,14 @@ def parse_arguments():
        help="Manually set the model's layer number",
    )

+    parser.add_argument(
+        "--enable_arm64_bfloat16_fastmath_mlas_gemm",
+        required=False,
+        action="store_true",
+        help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ",
+    )
+    parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)
+
    FusionOptions.add_arguments(parser)

    args = parser.parse_args()
@ -905,6 +917,7 @@ def main():
                    use_raw_attention_mask,
                    model_fusion_statistics,
                    args.model_source,
+                    args.enable_arm64_bfloat16_fastmath_mlas_gemm,
                    args,
                )
            except Exception:
--- a/onnxruntime/python/tools/transformers/benchmark_helper.py
+++ b/onnxruntime/python/tools/transformers/benchmark_helper.py
@ -85,6 +85,7 @@ def create_onnxruntime_session(
    num_threads=-1,
    enable_profiling=False,
    verbose=False,
+    enable_mlas_gemm_fastmath_arm64_bfloat16=False,
    provider_options={},  # map execution provider name to its option  # noqa: B006
 ):
    session = None
@ -136,6 +137,9 @@ def create_onnxruntime_session(
        if provider_options:
            providers = [(name, provider_options[name]) if name in provider_options else name for name in providers]

+        if enable_mlas_gemm_fastmath_arm64_bfloat16:
+            sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1")
+
        session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers)
    except Exception:
        logger.error("Exception", exc_info=True)
--- a/onnxruntime/python/tools/transformers/run_benchmark.sh
+++ b/onnxruntime/python/tools/transformers/run_benchmark.sh
@ -34,6 +34,9 @@ run_gpu_fp16=true
 run_cpu_fp32=false
 run_cpu_int8=false

+# Set this to true to enable bfloat16 fastmath gemm kernels on aarch64 platforms with bfloat16 support
+arm64_bfloat16_fastmath_mode=false
+
 average_over=1000
 # CPU takes longer time to run, only run 100 inferences to get average latency.
 if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
@ -63,7 +66,7 @@ models_to_test="bert-base-cased roberta-base distilbert-base-uncased"
 # export CUDA_VISIBLE_DEVICES=1

 # This script will generate a logs file with a list of commands used in tests.
-echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log
+echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" arm64_bfloat16_fastmath_mode=$arm64_bfloat16_fastmath_mode >> benchmark.log

 # Set it to false to skip testing. You can use it to dry run this script with the log file.
 run_tests=true
@ -127,6 +130,10 @@ if [ "$force_layer_number" = true ] ; then
  benchmark_options="$benchmark_options --force_num_layers $layer_number"
 fi

+if [ "$arm64_bfloat16_fastmath_mode" = true ] ; then
+  benchmark_options="$benchmark_options --enable_arm64_bfloat16_fastmath_mlas_gemm"
+fi
+
 # -------------------------------------------
 run_one_test() {
    if [ "$run_ort" = true ] ; then