From a7738b52c5fbeb6f29e33da2421fb3e0ad7dfd7e Mon Sep 17 00:00:00 2001
From: zhangyaobit <1034716+zhangyaobit@users.noreply.github.com>
Date: Tue, 1 Mar 2022 16:00:16 -0800
Subject: [PATCH] Add microbench to benchmark single operators. (#10678)

* Add microbench to benchmark single operators.

* Move to tool directory; separate data generation from io binding.

* Refactor.

* Clean up.

* Use precision instead for extensibility.

* Refactor the create_io_binding function to take in torch tensors
instead of numpy arrays; this reflects more accurately what
the function does, because it is torch tensors that got bound.
---
 .../python/tools/microbench/benchmark.py      |  61 ++++++++++++++++
 onnxruntime/python/tools/microbench/matmul.py |  68 ++++++++++++++++++
 .../tools/microbench/models/matmul_fp16.onnx  | Bin 0 -> 171 bytes
 .../tools/microbench/models/matmul_fp32.onnx  | Bin 0 -> 171 bytes
 4 files changed, 129 insertions(+)
 create mode 100644 onnxruntime/python/tools/microbench/benchmark.py
 create mode 100644 onnxruntime/python/tools/microbench/matmul.py
 create mode 100644 onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
 create mode 100644 onnxruntime/python/tools/microbench/models/matmul_fp32.onnx

diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py
new file mode 100644
index 0000000000..89359782a3
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/benchmark.py
@@ -0,0 +1,61 @@
+import time
+import numpy
+import onnxruntime as ort
+import torch
+
+
+def create_input_output_tensors(inputs, outputs, device="cuda"):
+    """Return (input_tensors, output_tensors): both {name: numpy array} dicts converted to torch tensors on `device`."""
+    def to_tensors(arrays):  # one conversion shared by both dicts
+        return {name: torch.from_numpy(array).to(device) for name, array in arrays.items()}
+    return to_tensors(inputs), to_tensors(outputs)
+
+
+def numpy_type(torch_type):
+    """Return the numpy type for a torch dtype; only float32 and float16 are mapped, any other key raises KeyError."""
+    torch_to_numpy = {torch.float16: numpy.float16, torch.float32: numpy.float32}
+    return torch_to_numpy[torch_type]
+
+
+def create_io_binding(sess, input_tensors, output_tensors):
+    """Create an io_binding on `sess` and bind each named tensor to it, inputs first, then outputs.
+
+    Every tensor is bound with its device type, device id 0, numpy element type, shape and raw data pointer.
+    """
+    binding = sess.io_binding()
+    for bind, tensors in ((binding.bind_input, input_tensors), (binding.bind_output, output_tensors)):
+        for name, tensor in tensors.items():
+            bind(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+    return binding
+
+
+def create_session(onnx_file, provider, profiling):
+    """Build an InferenceSession for `onnx_file` that uses a single execution provider.
+
+    `provider` equal to "rocm" selects ROCMExecutionProvider; any other value selects CUDAExecutionProvider.
+    `profiling` is assigned to the session options' enable_profiling attribute.
+    """
+    options = ort.SessionOptions()
+    options.enable_profiling = profiling
+    providers = ["ROCMExecutionProvider"] if provider == "rocm" else ["CUDAExecutionProvider"]
+    return ort.InferenceSession(onnx_file, sess_options=options, providers=providers)
+ 
+
+def benchmark(onnx_file, inputs, outputs, provider, profiling=False):
+    """Return the average time, in milliseconds, of one run of `onnx_file` with bound device tensors.
+
+    `inputs` and `outputs` are {name: numpy array} dicts; `provider` and `profiling` are forwarded
+    to create_session. Ten untimed warm-up runs are followed by 100 timed runs.
+    """
+    sess = create_session(onnx_file, provider, profiling)
+    input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
+    io_binding = create_io_binding(sess, input_tensors, output_tensors)
+
+    for _ in range(10):  # warm up: these runs are not timed
+        sess.run_with_iobinding(io_binding)
+
+    max_iters = 100
+    start_time = time.perf_counter()  # monotonic, high-resolution clock; time.time() can jump with wall-clock changes
+    for _ in range(max_iters):
+        sess.run_with_iobinding(io_binding)
+    return (time.perf_counter() - start_time) * 1000 / max_iters  # seconds -> milliseconds per run
diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py
new file mode 100644
index 0000000000..b7f11f0896
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/matmul.py
@@ -0,0 +1,68 @@
+import numpy as np
+from benchmark import benchmark 
+
+
+def create_inputs_outputs(b1, b2, m, k, n, data_type):
+    """Generate random arrays A (b1, b2, m, k) and B (b1, b2, k, n) plus an output array of shape (b1, b2, m, n).
+
+    The global numpy RNG is reseeded with 0 on every call, so repeated calls give the same data. Returns
+    (inputs, outputs) as {name: array} dicts of dtype `data_type`; "return_val" is random-filled, not computed from A and B.
+    """
+    def random_array(rows, cols):
+        return np.random.rand(b1, b2, rows, cols).astype(data_type)
+    np.random.seed(0)
+    return {"A": random_array(m, k), "B": random_array(k, n)}, {"return_val": random_array(m, n)}
+
+
+def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
+    """Append, in place, four (b1, b2, m, k, n, data_type, model) tuples for one model configuration; m is always seq_len."""
+    benchmark_cases += [
+        (1, batch_size, seq_len, hidden_size, hidden_size, data_type, model),
+        (1, batch_size, seq_len, intermediate_dimension, hidden_size, data_type, model),
+        (1, batch_size, seq_len, hidden_size, intermediate_dimension, data_type, model),
+        (batch_size, num_heads, seq_len, seq_len, hidden_size // num_heads, data_type, model)]  # exact integer division
+
+
+def create_benchmark_cases(precision="fp16"):
+    """Build the list of benchmark cases for the bert-large and bert-base configurations.
+
+    `precision` equal to "fp16" selects the fp16 model and np.float16 data; any other value selects
+    the fp32 model and np.float32 data. Each case is a (b1, b2, m, k, n, data_type, model) tuple
+    appended by add_benchmark_case, bert-large cases first.
+    """
+    if precision == "fp16":
+        model, data_type = "models/matmul_fp16.onnx", np.float16
+    else:
+        model, data_type = "models/matmul_fp32.onnx", np.float32
+
+    # (hidden_size, seq_len, num_heads, batch_size) per model configuration
+    configurations = (
+        (1024, 384, 16, 1),  # bert-large
+        (768, 384, 12, 1),  # bert-base
+    )
+
+    cases = []
+    for hidden_size, seq_len, num_heads, batch_size in configurations:
+        # the intermediate dimension is four times the hidden size
+        intermediate_dimension = hidden_size * 4
+        add_benchmark_case(
+            cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model
+        )
+    return cases
+
+
+def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, provider="rocm"):
+    """Return benchmark()'s result (average milliseconds per run) for one (b1, b2, m, k, n) case run on `provider`."""
+    inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
+    return benchmark(onnx_file, inputs, outputs, provider)
+
+
+def main():
+    """Benchmark every case from create_benchmark_cases(), printing its latency in ms and throughput in tflops."""
+    for (b1, b2, m, k, n), time in ((case[:5], benchmark_matmul(*case)) for case in create_benchmark_cases()):
+        tflops = 2 * b1 * b2 * m * k * n / time / 1000000000  # time is in ms, so ops / ms / 1e9 is tera-ops per second
+        print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {time:7.4f} ms, {tflops:4.2f} tflops")
+
+
+if __name__ == "__main__":  # run the benchmarks only when executed as a script, not on import
+    main()
diff --git a/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx b/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..d227b1abd3af072a66ee5f8c0e8507d1a583df70
GIT binary patch
literal 171
zcmd<!5MnMc%D0-$$R)+a=*Y$BB*ayeT2fk+7hjf`qr~Q$SmIlnBgCDXSdv?s6Q5RK
zXcnaiR463FCB?xdB*w)e#FS(RCXKk5g&1=|L^eo?6D}p$ASHQ8T3kpL;Zh4WF3*XD
Ii$Q=N08l?2`Tzg`

literal 0
HcmV?d00001

diff --git a/onnxruntime/python/tools/microbench/models/matmul_fp32.onnx b/onnxruntime/python/tools/microbench/models/matmul_fp32.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..33584205d96552ca77e6498b76c9f70922960870
GIT binary patch
literal 171
zcmd<!5MnMc%D0-$$R)+a=*Y$BB*ayeT2fk+7hjf`qr~Q$SmIlnBgCDXSdv?s6Q5RK
zY!sykR463FCB?xgB*w)e#FS(RCXKk5g&1=|L^eo?6D}p$ASHQ8T3kpL;Zh4WF3*XD
Ii$Q=N07thS-2eap

literal 0
HcmV?d00001