From 7fa6f4fca4a05ac025995da70b7a5fa92ee46d83 Mon Sep 17 00:00:00 2001 From: snadampal <87143774+snadampal@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:20:36 -0600 Subject: [PATCH] add arm64 bfloat16 fastmath mode option for transformers benchmarking script (#19294) Add arm64 bfloat16 fastmath mode option for transformers benchmarking script. ### Motivation and Context onnxruntime now supports bfloat16 fastmath gemm kernels for arm64 platforms with bfloat16 instruction support. This PR updates benchmark scripts to test that mode. --- onnxruntime/python/tools/transformers/benchmark.py | 13 +++++++++++++ .../python/tools/transformers/benchmark_helper.py | 4 ++++ .../python/tools/transformers/run_benchmark.sh | 9 ++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) mode change 100644 => 100755 onnxruntime/python/tools/transformers/run_benchmark.sh diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 22f2cfb8a0..89f9947688 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -36,6 +36,8 @@ python benchmark.py -e torchscript onnxruntime -p "int8" -o Run OnnxRuntime with the ROCM provider and graph optimization script: python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm + Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support: + python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm It is recommended to use run_benchmark.sh to launch benchmark. """ @@ -106,6 +108,7 @@ def run_onnxruntime( use_raw_attention_mask, model_fusion_statistics, model_source, + enable_arm64_bfloat16_fastmath_mlas_gemm, args, ): import onnxruntime @@ -209,6 +212,7 @@ def run_onnxruntime( enable_all_optimization=True, num_threads=num_threads, verbose=verbose, + enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm, ) if ort_session is None: continue @@ -760,6 +764,14 @@ def parse_arguments(): help="Manually set the model's layer number", ) + parser.add_argument( + "--enable_arm64_bfloat16_fastmath_mlas_gemm", + required=False, + action="store_true", + help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ", + ) + parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False) + FusionOptions.add_arguments(parser) args = parser.parse_args() @@ -905,6 +917,7 @@ def main(): use_raw_attention_mask, model_fusion_statistics, args.model_source, + args.enable_arm64_bfloat16_fastmath_mlas_gemm, args, ) except Exception: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index c5edae4259..c7d93470a7 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -85,6 +85,7 @@ def create_onnxruntime_session( num_threads=-1, enable_profiling=False, verbose=False, + enable_mlas_gemm_fastmath_arm64_bfloat16=False, provider_options={}, # map execution provider name to its option # noqa: B006 ): session = None @@ -136,6 +137,9 @@ def create_onnxruntime_session( if provider_options: providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] + if enable_mlas_gemm_fastmath_arm64_bfloat16: + sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1") + session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) except Exception: logger.error("Exception", exc_info=True) diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh old mode 100644 new mode 100755 index f0422839c1..64d6ecde61 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -34,6 +34,9 @@ run_gpu_fp16=true run_cpu_fp32=false run_cpu_int8=false +# Set this to true to enable bfloat16 fastmath gemm kernels on aarch64 platforms with bfloat16 support +arm64_bfloat16_fastmath_mode=false + average_over=1000 # CPU takes longer time to run, only run 100 inferences to get average latency. if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then @@ -63,7 +66,7 @@ models_to_test="bert-base-cased roberta-base distilbert-base-uncased" # export CUDA_VISIBLE_DEVICES=1 # This script will generate a logs file with a list of commands used in tests. -echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log +echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" arm64_bfloat16_fastmath_mode=$arm64_bfloat16_fastmath_mode >> benchmark.log # Set it to false to skip testing. You can use it to dry run this script with the log file. run_tests=true @@ -127,6 +130,10 @@ if [ "$force_layer_number" = true ] ; then benchmark_options="$benchmark_options --force_num_layers $layer_number" fi +if [ "$arm64_bfloat16_fastmath_mode" = true ] ; then + benchmark_options="$benchmark_options --enable_arm64_bfloat16_fastmath_mlas_gemm" +fi + # ------------------------------------------- run_one_test() { if [ "$run_ort" = true ] ; then