Add micro-benchmarks for Attention and SkipLayerNormalization ops. (#10798)

* Add micro-benchmarks for Attention and SkipLayerNormalization ops. * Add choices for argument provider and precision. * Automatically select CUDA or ROCM execution provider.
2026-06-07 00:13:17 +00:00 · 2022-03-09 18:18:51 -08:00 · 2022-03-09 18:18:51 -08:00 · 9cbcc93e03
commit 9cbcc93e03
parent 1c313f4476
7 changed files with 309 additions and 25 deletions
--- a/onnxruntime/python/tools/microbench/attention.py
+++ b/onnxruntime/python/tools/microbench/attention.py
@ -0,0 +1,52 @@
+import argparse
+from dataclasses import dataclass
+import numpy as np
+from benchmark import BenchmarkOp, add_arguments
+
+
+@dataclass
+class OpParam:
+    """Shape and precision parameters for one Attention benchmark case."""
+
+    # First dimension of INPUT, MASK_INDEX and the output buffer.
+    batch_size: int
+    # Second dimension of INPUT, MASK_INDEX and the output buffer.
+    seq_len: int
+    # Last dimension of INPUT and the output buffer; first dimension of WEIGHT.
+    hidden_size: int
+    # Second dimension of WEIGHT and size of BIAS (768 * 3 in the bert-base case).
+    length: int
+    # NumPy dtype the floating-point tensors are cast to (np.float16 or np.float32).
+    data_type: type
+
+
+class BenchmarkAttention(BenchmarkOp):
+    """Micro-benchmark driver for the Attention operator models."""
+
+    def __init__(self, args):
+        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
+        super().__init__(args)
+
+    @classmethod
+    def create_inputs_outputs(cls, op_param):
+        """Build the named input and output arrays for one benchmark case.
+
+        Args:
+            op_param: ``OpParam`` giving the tensor sizes and the dtype.
+
+        Returns:
+            An ``(inputs, outputs)`` pair of dicts mapping tensor names to
+            NumPy arrays.
+        """
+        # Fixed seed so every run benchmarks the same data.
+        np.random.seed(0)
+        batch_size, seq_len = op_param.batch_size, op_param.seq_len
+        hidden_size, length = op_param.hidden_size, op_param.length
+        dtype = op_param.data_type
+
+        input_data = np.random.rand(batch_size, seq_len, hidden_size).astype(dtype)
+        weight = np.random.rand(hidden_size, length).astype(dtype)
+        bias = np.random.rand(length).astype(dtype)
+        # NOTE(review): rand() yields values in [0, 1), so the int32 cast
+        # makes this mask all zeros -- confirm that is the intended mask.
+        mask_index = np.random.rand(batch_size, seq_len).astype(np.int32)
+        # Only serves as the output buffer; its random contents are unused
+        # by this method.
+        output_data = np.random.rand(batch_size, seq_len, hidden_size).astype(dtype)
+
+        inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index}
+        outputs = {"return_val": output_data}
+        return inputs, outputs
+
+    def create_cases(self):
+        """Register the bert-base case for the precision chosen in ``self.args``."""
+        use_fp16 = self.args.precision == "fp16"
+        model = "models/attention_fp16.onnx" if use_fp16 else "models/attention_fp32.onnx"
+        data_type = np.float16 if use_fp16 else np.float32
+        # bert-base
+        op_param = OpParam(1, 384, 768, 768 * 3, data_type)
+        self.add_case(op_param, model)
+
+    @classmethod
+    def case_profile(cls, op_param, time):
+        """Format the one-line result for a case; ``time`` is printed as ms."""
+        profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms"
+        return profile
+
+
+def main():
+    """Parse the shared benchmark arguments and run the Attention benchmark."""
+    arg_parser = argparse.ArgumentParser()
+    add_arguments(arg_parser)
+    BenchmarkAttention(arg_parser.parse_args()).benchmark()
+
+
+if __name__ == "__main__":
+    main()
--- a/onnxruntime/python/tools/microbench/benchmark.py
+++ b/onnxruntime/python/tools/microbench/benchmark.py
@ -1,63 +1,89 @@
 from abc import ABC, abstractmethod
 from argparse import ArgumentParser
-import time
+import logging
 import numpy
 import onnxruntime as ort
+import time
 import torch

+# Configure the root logger at INFO level so the provider and model messages
+# logged by Benchmark.__init__ are emitted.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+

 def numpy_type(torch_type):
+    """Return the NumPy type that corresponds to ``torch_type``.
+
+    Only ``torch.float32``, ``torch.float16`` and ``torch.int32`` are mapped;
+    any other dtype raises ``KeyError``.
+    """
    type_map = {torch.float32: numpy.float32,
-                torch.float16: numpy.float16}
+                torch.float16: numpy.float16,
+                torch.int32: numpy.int32}
    return type_map[torch_type]


 def add_arguments(parser: ArgumentParser):
-    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
-    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
-    parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling')
+    """Register the command-line options shared by the micro-benchmarks.
+
+    Args:
+        parser: Parser that receives ``--provider``, ``--precision`` and
+            ``--profiling``.
+    """
+
+    def boolean(text):
+        # ``type=bool`` would treat every non-empty string, including
+        # "False", as True, so the spelling is parsed explicitly instead.
+        lowered = text.strip().lower()
+        if lowered in ("true", "t", "yes", "y", "1", "on"):
+            return True
+        if lowered in ("false", "f", "no", "n", "0", "off", ""):
+            return False
+        # argparse turns a ValueError raised by a ``type`` callable into an
+        # "invalid boolean value" usage error.
+        raise ValueError(text)
+
+    # ``None`` is only the default (auto-select); it is not listed in
+    # ``choices`` because a value given on the command line is always a str.
+    parser.add_argument("--provider", required=False, type=str,
+                        choices=["cuda", "rocm", "cpu"], default=None,
+                        help=("Execution provider to use. By default, a "
+                              "provider is selected in the priority order "
+                              "(cuda|rocm, cpu) depending on availability."))
+    parser.add_argument("--precision", required=False, type=str,
+                        choices=["fp16", "fp32"], default="fp16",
+                        help="Number format to use")
+    # ``nargs="?"`` with ``const=True`` lets a bare ``--profiling`` enable it.
+    parser.add_argument('--profiling', required=False, type=boolean,
+                        nargs="?", const=True, default=False,
+                        help='If enable profiling')
+
+
+def provider_name(name):
+    """Translate a ``--provider`` choice into its onnxruntime provider name.
+
+    Raises:
+        KeyError: If ``name`` is not "cuda", "rocm" or "cpu".
+    """
+    return {
+        "cuda": "CUDAExecutionProvider",
+        "rocm": "ROCMExecutionProvider",
+        "cpu": "CPUExecutionProvider",
+    }[name]
+
+
+def get_default_provider():
+    """Return the preferred execution provider that onnxruntime reports.
+
+    CUDA is preferred over ROCm; the CPU provider is the fallback when
+    neither appears in ``ort.get_available_providers()``.
+    """
+    available = ort.get_available_providers()
+    for candidate in ("CUDAExecutionProvider", "ROCMExecutionProvider"):
+        if candidate in available:
+            return candidate
+    return "CPUExecutionProvider"


 class Benchmark:
    def __init__(self, model, inputs, outputs, args):
-        self.provider = args.provider
+        """Record the model, its I/O arrays and the options read from ``args``.
+
+        Args:
+            model: Model handed to ``ort.InferenceSession`` by
+                ``create_session``.
+            inputs: Mapping of input name to NumPy array.
+            outputs: Mapping of output name to NumPy array.
+            args: Parsed arguments providing ``provider`` and ``profiling``.
+        """
+        # ``None`` means no provider was requested, so pick one automatically.
+        self.provider = (get_default_provider() if args.provider is None
+                         else provider_name(args.provider))
+        logger.info(f"Execution provider: {self.provider}")
        self.profiling = args.profiling
        self.model = model
+        logger.info(f"Model: {self.model}")
        self.inputs = inputs
        self.outputs = outputs

    def create_input_output_tensors(self):
-        device = "cuda"
-        input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
-        output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
+        """Convert the stored NumPy arrays into torch tensors on the run device.
+
+        Returns:
+            An ``(input_tensors, output_tensors)`` pair of dicts keyed by
+            tensor name.
+        """
+        # Both GPU providers use the torch device name "cuda" here.
+        gpu_providers = ("CUDAExecutionProvider", "ROCMExecutionProvider")
+        device = "cuda" if self.provider in gpu_providers else "cpu"
+        input_tensors = {
+            name: torch.from_numpy(array).to(device)
+            for name, array in self.inputs.items()
+        }
+        output_tensors = {
+            name: torch.from_numpy(array).to(device)
+            for name, array in self.outputs.items()
+        }
        return input_tensors, output_tensors

    @classmethod
    def create_io_binding(cls, sess, input_tensors, output_tensors):
+        """Bind every input and output tensor to a new I/O binding of ``sess``.
+
+        Each tensor is bound under its dict key using its device type, the
+        literal 0, its NumPy element type (via ``numpy_type``), its shape and
+        its raw data pointer.
+
+        Returns:
+            The populated binding object obtained from ``sess.io_binding()``.
+        """
        io_binding = sess.io_binding()
        for name, tensor in input_tensors.items():
-            io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+            io_binding.bind_input(name, tensor.device.type, 0,
+                                  numpy_type(tensor.dtype), tensor.shape,
+                                  tensor.data_ptr())
        for name, tensor in output_tensors.items():
-            io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+            io_binding.bind_output(name, tensor.device.type, 0,
+                                   numpy_type(tensor.dtype), tensor.shape,
+                                   tensor.data_ptr())
        return io_binding

    def create_session(self):
+        """Create an inference session for ``self.model`` on ``self.provider``.
+
+        Profiling is switched on in the session options when
+        ``self.profiling`` is set.
+        """
+        # NOTE(review): nothing here verifies that the session really uses
+        # ``self.provider`` -- confirm whether a provider fallback inside
+        # onnxruntime should be detected and reported.
        sess_opt = ort.SessionOptions()
        sess_opt.enable_profiling = self.profiling
-        if self.provider == "rocm":
-            execution_provider = ["ROCMExecutionProvider"]
-        elif self.provider == "cuda":
-            execution_provider = ["CUDAExecutionProvider"]
-        else:
-            raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.")
-
-        sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider)
-
-        if self.provider == "rocm":
-            assert 'ROCMExecutionProvider' in sess.get_providers()
-        elif self.provider == "cuda":
-            assert 'CUDAExecutionProvider' in sess.get_providers()
-
+        sess = ort.InferenceSession(self.model, sess_options=sess_opt,
+                                    providers=[self.provider])
        return sess

    def benchmark(self):
--- a/onnxruntime/python/tools/microbench/models/attention_fp16.onnx
+++ b/onnxruntime/python/tools/microbench/models/attention_fp16.onnx
@ -0,0 +1,40 @@
+p2o:Ë
+Y
+INPUT
+WEIGHT
+BIAS
+
+MASK_INDEX
+return_val"	Attention*
+	num_heads :
com.microsoft	attentionZ7
+INPUT.
+,
+(
+
+batch_size
+	seq_len
+
hidden_sizeZ&
+WEIGHT
+
+
+
hidden_size
+lenZ
+BIAS
+
+
+lenZ-
+
+MASK_INDEX
+
+
+batch_size
+	seq_lenb<
+
+return_val.
+,
+(
+
+batch_size
+	seq_len
+
hidden_sizeB
+
com.microsoft
--- a/onnxruntime/python/tools/microbench/models/attention_fp32.onnx
+++ b/onnxruntime/python/tools/microbench/models/attention_fp32.onnx
@ -0,0 +1,36 @@
+p2o:Ë
+Y
+INPUT
+WEIGHT
+BIAS
+
+MASK_INDEX
+return_val"	Attention*
+	num_heads :
com.microsoft	attentionZ7
+INPUT.
+,(
+
+batch_size
+	seq_len
+
hidden_sizeZ&
+WEIGHT
+
+
hidden_size
+lenZ
+BIAS
+
+lenZ-
+
+MASK_INDEX
+
+
+batch_size
+	seq_lenb<
+
+return_val.
+,(
+
+batch_size
+	seq_len
+
hidden_sizeB
+
com.microsoft
--- a/onnxruntime/python/tools/microbench/models/skip_layer_norm_fp16.onnx
+++ b/onnxruntime/python/tools/microbench/models/skip_layer_norm_fp16.onnx
@ -0,0 +1,41 @@
+p2o:þ
+f
+INPUT
+SKIP
+GAMMA
+BETA
+BIAS
+return_val"SkipLayerNormalization*
+epsilonoƒ: :
com.microsoftskip_layer_normalizationZ2
+INPUT)
+'
+#
+batch
+	seq_len
+
hidden_sizeZ1
+SKIP)
+'
+#
+batch
+	seq_len
+
hidden_sizeZ
+GAMMA
+
+
+
hidden_sizeZ
+BETA
+
+
+
hidden_sizeZ
+BIAS
+
+
+
hidden_sizeb5
+
+return_val'
+%
+!
+batch
+	seq_len
+	inter_dimB
+
com.microsoft
--- a/onnxruntime/python/tools/microbench/models/skip_layer_norm_fp32.onnx
+++ b/onnxruntime/python/tools/microbench/models/skip_layer_norm_fp32.onnx
@ -0,0 +1,35 @@
+p2o:þ
+f
+INPUT
+SKIP
+GAMMA
+BETA
+BIAS
+return_val"SkipLayerNormalization*
+epsilonoƒ: :
com.microsoftskip_layer_normalizationZ2
+INPUT)
+'#
+batch
+	seq_len
+
hidden_sizeZ1
+SKIP)
+'#
+batch
+	seq_len
+
hidden_sizeZ
+GAMMA
+
+
hidden_sizeZ
+BETA
+
+
hidden_sizeZ
+BIAS
+
+
hidden_sizeb5
+
+return_val'
+%!
+batch
+	seq_len
+	inter_dimB
+
com.microsoft
--- a/onnxruntime/python/tools/microbench/skip_layer_norm.py
+++ b/onnxruntime/python/tools/microbench/skip_layer_norm.py
@ -0,0 +1,54 @@
+import argparse
+from dataclasses import dataclass
+import numpy as np
+from benchmark import BenchmarkOp, add_arguments
+
+
+@dataclass
+class OpParam:
+    """Shape and precision parameters for one SkipLayerNormalization case."""
+
+    # First dimension of INPUT, SKIP and the output buffer.
+    batch_size: int
+    # Second dimension of INPUT, SKIP and the output buffer.
+    seq_len: int
+    # Last dimension of INPUT, SKIP and the output buffer; size of GAMMA, BETA and BIAS.
+    hidden_size: int
+    # NumPy dtype every tensor is cast to (np.float16 or np.float32).
+    data_type: type
+
+
+class BenchmarkSkipLayerNorm(BenchmarkOp):
+    """Micro-benchmark driver for the SkipLayerNormalization operator models."""
+
+    def __init__(self, args):
+        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
+        super().__init__(args)
+
+    @classmethod
+    def create_inputs_outputs(cls, op_param):
+        """Build the named input and output arrays for one benchmark case.
+
+        Args:
+            op_param: ``OpParam`` giving the tensor sizes and the dtype.
+
+        Returns:
+            An ``(inputs, outputs)`` pair of dicts mapping tensor names to
+            NumPy arrays.
+        """
+        # Fixed seed so every run benchmarks the same data.
+        np.random.seed(0)
+        full_shape = (op_param.batch_size, op_param.seq_len, op_param.hidden_size)
+        hidden_size = op_param.hidden_size
+        dtype = op_param.data_type
+
+        input_data = np.random.rand(*full_shape).astype(dtype)
+        skip = np.random.rand(*full_shape).astype(dtype)
+        gamma = np.random.rand(hidden_size).astype(dtype)
+        beta = np.random.rand(hidden_size).astype(dtype)
+        bias = np.random.rand(hidden_size).astype(dtype)
+        # Only serves as the output buffer; its random contents are unused
+        # by this method.
+        output_data = np.random.rand(*full_shape).astype(dtype)
+
+        inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias}
+        outputs = {"return_val": output_data}
+        return inputs, outputs
+
+    def create_cases(self):
+        """Register the bert-large case for the precision chosen in ``self.args``."""
+        use_fp16 = self.args.precision == "fp16"
+        model = "models/skip_layer_norm_fp16.onnx" if use_fp16 else "models/skip_layer_norm_fp32.onnx"
+        data_type = np.float16 if use_fp16 else np.float32
+        # bert-large
+        op_param = OpParam(1, 384, 1024, data_type)
+        self.add_case(op_param, model)
+
+    @classmethod
+    def case_profile(cls, op_param, time):
+        """Format the one-line result for a case; ``time`` is printed as ms."""
+        profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms"
+        return profile
+
+
+def main():
+    """Parse the shared benchmark arguments and run the SkipLayerNorm benchmark."""
+    arg_parser = argparse.ArgumentParser()
+    add_arguments(arg_parser)
+    BenchmarkSkipLayerNorm(arg_parser.parse_args()).benchmark()
+
+
+if __name__ == "__main__":
+    main()