Add microbench to benchmark single operators. (#10678)

* Add microbench to benchmark single operators. * Move to tool directory; separate data generation from io binding. * Refactor. * Clean up. * Use precision instead for extensibility. * Refactor the create_io_binding function to take in torch tensors instead of numpy arrays; this reflects more accurately what the function does, because it is torch tensors that are bound.
2026-07-10 17:37:14 +00:00 · 2022-03-01 16:00:16 -08:00 · 2022-03-01 16:00:16 -08:00 · a7738b52c5
commit a7738b52c5
parent 19464614e7
4 changed files with 129 additions and 0 deletions
--- a/onnxruntime/python/tools/microbench/benchmark.py
+++ b/onnxruntime/python/tools/microbench/benchmark.py
@ -0,0 +1,61 @@
+import time
+import numpy
+import onnxruntime as ort
+import torch
+
+
+def create_input_output_tensors(inputs, outputs, device="cuda"):
+    """Convert the numpy input and output arrays into torch tensors on ``device``.
+
+    Args:
+        inputs: Mapping of input name to numpy array.
+        outputs: Mapping of output name to numpy array.
+        device: Torch device the tensors are moved to. Defaults to "cuda",
+            the value this function previously hard-coded.
+
+    Returns:
+        A tuple ``(input_tensors, output_tensors)``; each element is a dict
+        that maps the original names to the corresponding torch tensor.
+    """
+    def to_device(arrays):
+        # One shared conversion path keeps inputs and outputs treated alike.
+        return {name: torch.from_numpy(array).to(device) for name, array in arrays.items()}
+
+    return to_device(inputs), to_device(outputs)
+
+
+def numpy_type(torch_type):
+    """Return the numpy scalar type that corresponds to a torch dtype.
+
+    Args:
+        torch_type: A ``torch.dtype`` such as ``torch.float16``.
+
+    Returns:
+        The matching numpy scalar type (for example ``numpy.float16``).
+
+    Raises:
+        KeyError: If ``torch_type`` has no entry in the mapping below.
+    """
+    type_map = {
+        torch.float16: numpy.float16,
+        torch.float32: numpy.float32,
+        torch.float64: numpy.float64,
+        torch.int8: numpy.int8,
+        torch.int16: numpy.int16,
+        torch.int32: numpy.int32,
+        torch.int64: numpy.int64,
+        torch.uint8: numpy.uint8,
+        torch.bool: numpy.bool_,
+    }
+    try:
+        return type_map[torch_type]
+    except KeyError:
+        # Same exception type as before, but say which dtype was rejected.
+        raise KeyError(f"no numpy equivalent registered for torch dtype {torch_type!r}") from None
+
+
+def create_io_binding(sess, input_tensors, output_tensors):
+    """Bind torch tensors to the inputs and outputs of an ONNX Runtime session.
+
+    Args:
+        sess: Session object providing ``io_binding()``.
+        input_tensors: Mapping of input name to torch tensor.
+        output_tensors: Mapping of output name to torch tensor.
+
+    Returns:
+        The io binding with every input and output bound.
+
+    Raises:
+        KeyError: Propagated from ``numpy_type`` when a tensor dtype has no
+            numpy equivalent.
+    """
+    io_binding = sess.io_binding()
+
+    def bind_args(name, tensor):
+        # Use the ordinal of the device that actually holds the tensor instead
+        # of assuming device 0. ``device.index`` is None when the device has
+        # no ordinal (e.g. CPU tensors), in which case 0 is kept.
+        device_id = tensor.device.index or 0
+        # Only the raw data pointer is handed over, so the caller presumably
+        # has to keep the tensor alive while the binding is in use.
+        return (name, tensor.device.type, device_id, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+
+    for name, tensor in input_tensors.items():
+        io_binding.bind_input(*bind_args(name, tensor))
+
+    for name, tensor in output_tensors.items():
+        io_binding.bind_output(*bind_args(name, tensor))
+
+    return io_binding
+
+
+def create_session(onnx_file, provider, profiling):
+    """Build an ONNX Runtime inference session for ``onnx_file``.
+
+    Args:
+        onnx_file: Model passed to ``ort.InferenceSession``.
+        provider: "rocm" selects the ROCm execution provider; every other
+            value selects the CUDA execution provider.
+        profiling: Value assigned to ``SessionOptions.enable_profiling``.
+
+    Returns:
+        The created ``ort.InferenceSession``.
+    """
+    options = ort.SessionOptions()
+    options.enable_profiling = profiling
+
+    # Anything that is not exactly "rocm" falls back to CUDA.
+    provider_name = "ROCMExecutionProvider" if provider == "rocm" else "CUDAExecutionProvider"
+    return ort.InferenceSession(onnx_file, sess_options=options, providers=[provider_name])
+ 
+
+def benchmark(onnx_file, inputs, outputs, provider, profiling=False, warmup_iters=10, max_iters=100):
+    """Measure the average latency of one run of the model in ``onnx_file``.
+
+    Args:
+        onnx_file: Model to load.
+        inputs: Mapping of input name to numpy array.
+        outputs: Mapping of output name to numpy array used as output buffer.
+        provider: Execution provider selector forwarded to ``create_session``.
+        profiling: Whether to enable session profiling.
+        warmup_iters: Number of untimed runs executed before measuring.
+        max_iters: Number of timed runs; must be positive.
+
+    Returns:
+        Average time per run, in milliseconds.
+
+    Raises:
+        ValueError: If ``max_iters`` is not positive.
+    """
+    if max_iters <= 0:
+        raise ValueError("max_iters must be a positive integer")
+
+    sess = create_session(onnx_file, provider, profiling)
+    input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
+    io_binding = create_io_binding(sess, input_tensors, output_tensors)
+
+    # warm up
+    for _ in range(warmup_iters):
+        sess.run_with_iobinding(io_binding)
+
+    # measure; perf_counter is the high-resolution clock intended for timing
+    # intervals and is not affected by system clock adjustments
+    start_time = time.perf_counter()
+    for _ in range(max_iters):
+        sess.run_with_iobinding(io_binding)
+
+    # time is in milliseconds
+    elapsed_time = (time.perf_counter() - start_time) * 1000 / max_iters
+    return elapsed_time
--- a/onnxruntime/python/tools/microbench/matmul.py
+++ b/onnxruntime/python/tools/microbench/matmul.py
@ -0,0 +1,68 @@
+import numpy as np
+from benchmark import benchmark 
+
+
+def create_inputs_outputs(b1, b2, m, k, n, data_type):
+    """Create random operands and an output buffer for a batched matmul.
+
+    Args:
+        b1: Size of the first batch dimension.
+        b2: Size of the second batch dimension.
+        m: Rows of ``A`` and of the result.
+        k: Columns of ``A`` / rows of ``B``.
+        n: Columns of ``B`` and of the result.
+        data_type: numpy dtype the arrays are cast to.
+
+    Returns:
+        A tuple ``(inputs, outputs)`` where ``inputs`` maps "A" and "B" to
+        arrays of shape ``(b1, b2, m, k)`` and ``(b1, b2, k, n)``, and
+        ``outputs`` maps "return_val" to an array of shape ``(b1, b2, m, n)``.
+    """
+    # A private generator seeded with 0 produces the same values as seeding
+    # the global numpy RNG with 0, without overwriting the caller's global
+    # random state.
+    rng = np.random.RandomState(0)
+    a = rng.rand(b1, b2, m, k).astype(data_type)
+    b = rng.rand(b1, b2, k, n).astype(data_type)
+    c = rng.rand(b1, b2, m, n).astype(data_type)
+
+    inputs = {"A": a, "B": b}
+    outputs = {"return_val": c}
+
+    return inputs, outputs
+
+
+def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
+    """Append four matmul problem sizes for one model configuration.
+
+    Each appended tuple is ``(b1, b2, m, k, n, data_type, model)``. The list
+    ``benchmark_cases`` is extended in place; nothing is returned.
+    """
+    head_size = int(hidden_size / num_heads)
+    shapes = (
+        (1, batch_size, seq_len, hidden_size, hidden_size),
+        (1, batch_size, seq_len, intermediate_dimension, hidden_size),
+        (1, batch_size, seq_len, hidden_size, intermediate_dimension),
+        (batch_size, num_heads, seq_len, seq_len, head_size),
+    )
+    benchmark_cases.extend(shape + (data_type, model) for shape in shapes)
+
+
+def create_benchmark_cases(precision="fp16"):
+    """Return the matmul benchmark cases for the bert-large and bert-base sizes.
+
+    ``precision`` equal to "fp16" selects the fp16 model and ``np.float16``;
+    any other value selects the fp32 model and ``np.float32``.
+    """
+    use_fp16 = precision == "fp16"
+    model = "models/matmul_fp16.onnx" if use_fp16 else "models/matmul_fp32.onnx"
+    data_type = np.float16 if use_fp16 else np.float32
+
+    # (hidden_size, seq_len, num_heads, batch_size) per model configuration
+    configurations = (
+        (1024, 384, 16, 1),  # bert-large
+        (768, 384, 12, 1),  # bert-base
+    )
+
+    cases = []
+    for hidden_size, seq_len, num_heads, batch_size in configurations:
+        intermediate_dimension = hidden_size * 4
+        add_benchmark_case(cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
+
+    return cases
+
+
+def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, provider="rocm"):
+    """Benchmark one batched matmul problem and return its latency.
+
+    Args:
+        b1, b2, m, k, n: Problem dimensions forwarded to
+            ``create_inputs_outputs``.
+        data_type: numpy dtype of the generated arrays.
+        onnx_file: Model to run.
+        provider: Execution provider selector forwarded to ``benchmark``.
+            Defaults to "rocm", the value this function previously hard-coded.
+
+    Returns:
+        Whatever ``benchmark`` returns for this problem (its average time per
+        run).
+    """
+    inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
+    return benchmark(onnx_file, inputs, outputs, provider)
+
+
+def main():
+    """Run every benchmark case and print its latency and throughput."""
+    for case in create_benchmark_cases():
+        b1, b2, m, k, n, data_type, onnx_file = case
+        elapsed_ms = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file)
+        # 2 * b1 * b2 * m * k * n operations; per millisecond, scaled by 1e9.
+        operations = b1 * b2 * m * k * n * 2
+        tflops = operations / elapsed_ms / 1000000000
+        print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {elapsed_ms:7.4f} ms, {tflops:4.2f} tflops")
+
+
+# Run the benchmark sweep only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
+++ b/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
--- a/onnxruntime/python/tools/microbench/models/matmul_fp32.onnx
+++ b/onnxruntime/python/tools/microbench/models/matmul_fp32.onnx