mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
add arm64 bfloat16 fastmath mode option for transformers benchmarking script (#19294)
Add arm64 bfloat16 fastmath mode option for transformers benchmarking script.

### Motivation and Context

onnxruntime now supports bfloat16 fastmath gemm kernels for arm64 platforms with bfloat16 instruction support. This PR updates the benchmark scripts to test that mode.
This commit is contained in:
parent
90e2e8561f
commit
7fa6f4fca4
3 changed files with 25 additions and 1 deletions
|
|
@ -36,6 +36,8 @@
|
|||
python benchmark.py -e torchscript onnxruntime -p "int8" -o
|
||||
Run OnnxRuntime with the ROCM provider and graph optimization script:
|
||||
python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
|
||||
Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
|
||||
python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm
|
||||
|
||||
It is recommended to use run_benchmark.sh to launch benchmark.
|
||||
"""
|
||||
|
|
@ -106,6 +108,7 @@ def run_onnxruntime(
|
|||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
model_source,
|
||||
enable_arm64_bfloat16_fastmath_mlas_gemm,
|
||||
args,
|
||||
):
|
||||
import onnxruntime
|
||||
|
|
@ -209,6 +212,7 @@ def run_onnxruntime(
|
|||
enable_all_optimization=True,
|
||||
num_threads=num_threads,
|
||||
verbose=verbose,
|
||||
enable_mlas_gemm_fastmath_arm64_bfloat16=enable_arm64_bfloat16_fastmath_mlas_gemm,
|
||||
)
|
||||
if ort_session is None:
|
||||
continue
|
||||
|
|
@ -760,6 +764,14 @@ def parse_arguments():
|
|||
help="Manually set the model's layer number",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable_arm64_bfloat16_fastmath_mlas_gemm",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Enable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP ",
|
||||
)
|
||||
parser.set_defaults(enable_arm64_bfloat16_fastmath_mlas_gemm=False)
|
||||
|
||||
FusionOptions.add_arguments(parser)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
@ -905,6 +917,7 @@ def main():
|
|||
use_raw_attention_mask,
|
||||
model_fusion_statistics,
|
||||
args.model_source,
|
||||
args.enable_arm64_bfloat16_fastmath_mlas_gemm,
|
||||
args,
|
||||
)
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ def create_onnxruntime_session(
|
|||
num_threads=-1,
|
||||
enable_profiling=False,
|
||||
verbose=False,
|
||||
enable_mlas_gemm_fastmath_arm64_bfloat16=False,
|
||||
provider_options={}, # map execution provider name to its option # noqa: B006
|
||||
):
|
||||
session = None
|
||||
|
|
@ -136,6 +137,9 @@ def create_onnxruntime_session(
|
|||
if provider_options:
|
||||
providers = [(name, provider_options[name]) if name in provider_options else name for name in providers]
|
||||
|
||||
if enable_mlas_gemm_fastmath_arm64_bfloat16:
|
||||
sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1")
|
||||
|
||||
session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers)
|
||||
except Exception:
|
||||
logger.error("Exception", exc_info=True)
|
||||
|
|
|
|||
9
onnxruntime/python/tools/transformers/run_benchmark.sh
Normal file → Executable file
9
onnxruntime/python/tools/transformers/run_benchmark.sh
Normal file → Executable file
|
|
@ -34,6 +34,9 @@ run_gpu_fp16=true
|
|||
run_cpu_fp32=false
|
||||
run_cpu_int8=false
|
||||
|
||||
# Set this to true to enable bfloat16 fastmath gemm kernels on aarch64 platforms with bfloat16 support
|
||||
arm64_bfloat16_fastmath_mode=false
|
||||
|
||||
average_over=1000
|
||||
# CPU takes longer time to run, only run 100 inferences to get average latency.
|
||||
if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
|
||||
|
|
@ -63,7 +66,7 @@ models_to_test="bert-base-cased roberta-base distilbert-base-uncased"
|
|||
# export CUDA_VISIBLE_DEVICES=1
|
||||
|
||||
# This script will generate a logs file with a list of commands used in tests.
|
||||
echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" >> benchmark.log
|
||||
echo echo "ort=$run_ort torch=$run_torch torch2=$run_torch2 torchscript=$run_torchscript tensorflow=$run_tensorflow gpu_fp32=$run_gpu_fp32 gpu_fp16=$run_gpu_fp16 cpu=$run_cpu optimizer=$use_optimizer batch=$batch_sizes sequence=$sequence_length models=$models_to_test" arm64_bfloat16_fastmath_mode=$arm64_bfloat16_fastmath_mode >> benchmark.log
|
||||
|
||||
# Set it to false to skip testing. You can use it to dry run this script with the log file.
|
||||
run_tests=true
|
||||
|
|
@ -127,6 +130,10 @@ if [ "$force_layer_number" = true ] ; then
|
|||
benchmark_options="$benchmark_options --force_num_layers $layer_number"
|
||||
fi
|
||||
|
||||
if [ "$arm64_bfloat16_fastmath_mode" = true ] ; then
|
||||
benchmark_options="$benchmark_options --enable_arm64_bfloat16_fastmath_mlas_gemm"
|
||||
fi
|
||||
|
||||
# -------------------------------------------
|
||||
run_one_test() {
|
||||
if [ "$run_ort" = true ] ; then
|
||||
|
|
|
|||
Loading…
Reference in a new issue