mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-25 22:26:24 +00:00
Add microbench to benchmark single operators. (#10678)
* Add microbench to benchmark single operators. * Move to tool directory; separate data generation from io binding. * Refactor. * Clean up. * Use precision instead for extensibility. * Refactor the create_io_binding function to take in torch tensors instead of numpy arrays; this reflects more accurately what the function does, because it is torch tensors that get bound.
This commit is contained in:
parent
19464614e7
commit
a7738b52c5
4 changed files with 129 additions and 0 deletions
61
onnxruntime/python/tools/microbench/benchmark.py
Normal file
61
onnxruntime/python/tools/microbench/benchmark.py
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
import time
|
||||
import numpy
|
||||
import onnxruntime as ort
|
||||
import torch
|
||||
|
||||
|
||||
def create_input_output_tensors(inputs, outputs, device="cuda"):
    """Convert numpy input/output arrays into torch tensors placed on ``device``.

    Args:
        inputs: Mapping of input name to numpy array.
        outputs: Mapping of output name to numpy array.
        device: Torch device the tensors are moved to. Defaults to ``"cuda"``,
            the value that used to be hard-coded here.

    Returns:
        A ``(input_tensors, output_tensors)`` tuple of dicts that keep the
        names (and order) of ``inputs`` and ``outputs``.
    """

    def to_device(arrays):
        # torch.from_numpy wraps the array; .to(device) then moves/copies it.
        return {name: torch.from_numpy(array).to(device) for name, array in arrays.items()}

    return to_device(inputs), to_device(outputs)
|
||||
|
||||
|
||||
def numpy_type(torch_type):
    """Return the numpy scalar type that corresponds to a torch dtype.

    Args:
        torch_type: A ``torch.dtype`` such as ``torch.float16``.

    Returns:
        The matching numpy scalar type (for example ``numpy.float16``).

    Raises:
        KeyError: If ``torch_type`` has no entry in the mapping below.
    """
    type_map = {
        torch.float16: numpy.float16,
        torch.float32: numpy.float32,
        torch.float64: numpy.float64,
        torch.int8: numpy.int8,
        torch.int16: numpy.int16,
        torch.int32: numpy.int32,
        torch.int64: numpy.int64,
        torch.uint8: numpy.uint8,
    }
    try:
        return type_map[torch_type]
    except KeyError:
        # Same exception type as a plain dict lookup, but say what went wrong.
        raise KeyError(f"unsupported torch dtype: {torch_type}") from None
|
||||
|
||||
|
||||
def create_io_binding(sess, input_tensors, output_tensors):
    """Create an IO binding on ``sess`` that points at the given torch tensors.

    Each tensor is bound by name using its device type, device index, element
    type, shape and raw data pointer, so no data is passed by value here.

    Args:
        sess: Session object providing ``io_binding()``.
        input_tensors: Mapping of input name to torch tensor.
        output_tensors: Mapping of output name to torch tensor.

    Returns:
        The populated IO binding object.

    Raises:
        KeyError: Propagated from ``numpy_type`` when a tensor dtype has no
            numpy mapping.
    """
    io_binding = sess.io_binding()

    def bind_all(bind, tensors):
        for name, tensor in tensors.items():
            # Use the device the tensor actually lives on instead of assuming
            # device 0; a device without an explicit index reports None.
            device_id = tensor.device.index or 0
            bind(
                name,
                tensor.device.type,
                device_id,
                numpy_type(tensor.dtype),
                tensor.shape,
                tensor.data_ptr(),
            )

    # NOTE(review): only raw pointers are handed over, so the caller presumably
    # has to keep the tensors alive while the binding is in use - confirm.
    bind_all(io_binding.bind_input, input_tensors)
    bind_all(io_binding.bind_output, output_tensors)

    return io_binding
|
||||
|
||||
|
||||
def create_session(onnx_file, provider, profiling):
    """Build an ``ort.InferenceSession`` for ``onnx_file``.

    Args:
        onnx_file: Model passed to ``ort.InferenceSession``.
        provider: ``"rocm"`` selects ``ROCMExecutionProvider``; any other
            value selects ``CUDAExecutionProvider``.
        profiling: Value assigned to ``SessionOptions.enable_profiling``.

    Returns:
        The created inference session.
    """
    options = ort.SessionOptions()
    options.enable_profiling = profiling

    provider_name = "ROCMExecutionProvider" if provider == "rocm" else "CUDAExecutionProvider"

    return ort.InferenceSession(onnx_file, sess_options=options, providers=[provider_name])
|
||||
|
||||
|
||||
def benchmark(onnx_file, inputs, outputs, provider, profiling=False):
    """Measure the average latency of running ``onnx_file`` with IO binding.

    The session is run 10 times as warm-up and then 100 timed times.

    Args:
        onnx_file: Model forwarded to ``create_session``.
        inputs: Mapping of input name to numpy array.
        outputs: Mapping of output name to numpy array used as output buffers.
        provider: Provider selector forwarded to ``create_session``.
        profiling: Profiling flag forwarded to ``create_session``.

    Returns:
        Average time per timed run, in milliseconds.
    """
    sess = create_session(onnx_file, provider, profiling)
    input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
    io_binding = create_io_binding(sess, input_tensors, output_tensors)

    # warm up
    for _ in range(10):
        sess.run_with_iobinding(io_binding)

    # measure
    # NOTE(review): assumes run_with_iobinding returns only after the device
    # work has completed; otherwise the timing below undercounts - confirm.
    max_iters = 100
    # perf_counter is monotonic and high resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted mid-measurement.
    start_time = time.perf_counter()
    for _ in range(max_iters):
        sess.run_with_iobinding(io_binding)

    # time is in milliseconds
    elapsed_time = (time.perf_counter() - start_time) * 1000 / max_iters
    return elapsed_time
|
||||
68
onnxruntime/python/tools/microbench/matmul.py
Normal file
68
onnxruntime/python/tools/microbench/matmul.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
import numpy as np
|
||||
from benchmark import benchmark
|
||||
|
||||
|
||||
def create_inputs_outputs(b1, b2, m, k, n, data_type):
    """Generate random 4-D MatMul operands and an output buffer.

    The global numpy RNG is re-seeded with 0 on every call, so the generated
    data is the same for the same arguments.

    Args:
        b1: Size of the first leading dimension.
        b2: Size of the second leading dimension.
        m: Rows of ``A`` and of the output.
        k: Columns of ``A`` / rows of ``B``.
        n: Columns of ``B`` and of the output.
        data_type: Numpy dtype the arrays are cast to.

    Returns:
        ``(inputs, outputs)`` where ``inputs`` holds ``"A"`` with shape
        ``(b1, b2, m, k)`` and ``"B"`` with shape ``(b1, b2, k, n)``, and
        ``outputs`` holds ``"return_val"`` with shape ``(b1, b2, m, n)``.
    """
    np.random.seed(0)

    # Dict order fixes the draw order (A, B, return_val) from the seeded RNG.
    shapes = {
        "A": (b1, b2, m, k),
        "B": (b1, b2, k, n),
        "return_val": (b1, b2, m, n),
    }
    arrays = {name: np.random.rand(*shape).astype(data_type) for name, shape in shapes.items()}

    inputs = {"A": arrays["A"], "B": arrays["B"]}
    outputs = {"return_val": arrays["return_val"]}
    return inputs, outputs
|
||||
|
||||
|
||||
def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
    """Append four MatMul cases for one model configuration to ``benchmark_cases``.

    Each case is a ``(b1, b2, m, k, n, data_type, model)`` tuple. The list is
    extended in place and nothing is returned.

    Args:
        benchmark_cases: List that receives the new cases.
        batch_size: Batch size of the configuration.
        seq_len: Sequence length of the configuration.
        hidden_size: Hidden dimension.
        intermediate_dimension: Intermediate dimension.
        num_heads: Number of heads; ``hidden_size / num_heads`` truncated to an
            int is used as ``n`` of the last case.
        data_type: Carried through unchanged into every case.
        model: Carried through unchanged into every case.
    """
    head_size = int(hidden_size / num_heads)

    # (b1, b2, m, k, n) for each of the four cases, in order.
    dims = (
        (1, batch_size, seq_len, hidden_size, hidden_size),
        (1, batch_size, seq_len, intermediate_dimension, hidden_size),
        (1, batch_size, seq_len, hidden_size, intermediate_dimension),
        (batch_size, num_heads, seq_len, seq_len, head_size),
    )
    benchmark_cases.extend(shape + (data_type, model) for shape in dims)
|
||||
|
||||
|
||||
def create_benchmark_cases(precision="fp16"):
    """Build the list of MatMul benchmark cases for bert-large and bert-base.

    Args:
        precision: ``"fp16"`` selects the fp16 model and ``np.float16``; any
            other value selects the fp32 model and ``np.float32``.

    Returns:
        A list of ``(b1, b2, m, k, n, data_type, model)`` tuples, with the
        bert-large cases first and the bert-base cases after them.
    """
    if precision == "fp16":
        model, data_type = "models/matmul_fp16.onnx", np.float16
    else:
        model, data_type = "models/matmul_fp32.onnx", np.float32

    # (hidden_size, seq_len, num_heads, batch_size) per model
    model_configs = (
        (1024, 384, 16, 1),  # bert-large
        (768, 384, 12, 1),  # bert-base
    )

    benchmark_cases = []
    for hidden_size, seq_len, num_heads, batch_size in model_configs:
        intermediate_dimension = hidden_size * 4
        add_benchmark_case(
            benchmark_cases,
            batch_size,
            seq_len,
            hidden_size,
            intermediate_dimension,
            num_heads,
            data_type,
            model,
        )

    return benchmark_cases
|
||||
|
||||
|
||||
def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, provider="rocm"):
    """Benchmark one MatMul case and return its average latency.

    Args:
        b1: First leading dimension of the operands.
        b2: Second leading dimension of the operands.
        m: Rows of ``A``.
        k: Columns of ``A`` / rows of ``B``.
        n: Columns of ``B``.
        data_type: Numpy dtype of the generated data.
        onnx_file: Model forwarded to ``benchmark``.
        provider: Provider selector forwarded to ``benchmark``. Defaults to
            ``"rocm"``, the value that used to be hard-coded here.

    Returns:
        Whatever ``benchmark`` returns for this case (its average run time).
    """
    inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
    elapsed = benchmark(onnx_file, inputs, outputs, provider)
    return elapsed
|
||||
|
||||
|
||||
def main():
    """Run every default benchmark case and print its latency and throughput."""
    for case in create_benchmark_cases():
        b1, b2, m, k, n, data_type, onnx_file = case
        time = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file)

        # 2 * b1 * b2 * m * k * n operations; dividing by milliseconds and
        # then by 1e9 yields tera-operations per second.
        op_count = b1 * b2 * m * k * n * 2
        tflops = op_count / time / 1000000000

        print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {time:7.4f} ms, {tflops:4.2f} tflops")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
BIN
onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
Normal file
BIN
onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
Normal file
Binary file not shown.
BIN
onnxruntime/python/tools/microbench/models/matmul_fp32.onnx
Normal file
BIN
onnxruntime/python/tools/microbench/models/matmul_fp32.onnx
Normal file
Binary file not shown.
Loading…
Reference in a new issue