From a7738b52c5fbeb6f29e33da2421fb3e0ad7dfd7e Mon Sep 17 00:00:00 2001
From: zhangyaobit <1034716+zhangyaobit@users.noreply.github.com>
Date: Tue, 1 Mar 2022 16:00:16 -0800
Subject: [PATCH] Add microbench to benchmark single operators. (#10678)

* Add microbench to benchmark single operators.

* Move to tool directory; separate data generation from io binding.

* Refactor.

* Clean up.

* Use precision instead for extensibility.

* Refactor the create_io_binding function to take in torch tensors instead of numpy arrays; this reflects more accurately what the function does, because it is torch tensors that got bound.
---
 .../python/tools/microbench/benchmark.py      | 70 ++++++++++++++++++
 onnxruntime/python/tools/microbench/matmul.py | 77 ++++++++++++++++++++
 .../tools/microbench/models/matmul_fp16.onnx  | Bin 0 -> 171 bytes
 .../tools/microbench/models/matmul_fp32.onnx  | Bin 0 -> 171 bytes
 4 files changed, 147 insertions(+)
 create mode 100644 onnxruntime/python/tools/microbench/benchmark.py
 create mode 100644 onnxruntime/python/tools/microbench/matmul.py
 create mode 100644 onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
 create mode 100644 onnxruntime/python/tools/microbench/models/matmul_fp32.onnx

diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py
new file mode 100644
index 0000000000..89359782a3
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/benchmark.py
@@ -0,0 +1,70 @@
+import time
+import numpy
+import onnxruntime as ort
+import torch
+
+
+def create_input_output_tensors(inputs, outputs):
+    """Copy the numpy arrays of both dicts to torch tensors on the GPU, keeping the names."""
+    device = "cuda"
+    input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()}
+    output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()}
+    return input_tensors, output_tensors
+
+
+def numpy_type(torch_type):
+    """Map a torch dtype to its numpy type; raises KeyError unless float32 or float16."""
+    type_map = {torch.float32: numpy.float32,
+                torch.float16: numpy.float16}
+    return type_map[torch_type]
+
+
+def create_io_binding(sess, input_tensors, output_tensors):
+    """Bind torch tensors to the session's inputs and outputs by data pointer.
+
+    Only each tensor's address is handed to ONNX Runtime, so the tensors must
+    outlive the returned binding.
+    """
+    io_binding = sess.io_binding()
+
+    for name, tensor in input_tensors.items():
+        io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+
+    for name, tensor in output_tensors.items():
+        io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
+
+    return io_binding
+
+
+def create_session(onnx_file, provider, profiling):
+    """Create an InferenceSession on ROCm when provider is "rocm", otherwise on CUDA."""
+    sess_opt = ort.SessionOptions()
+    sess_opt.enable_profiling = profiling
+    if provider == "rocm":
+        execution_provider = ["ROCMExecutionProvider"]
+    else:
+        execution_provider = ["CUDAExecutionProvider"]
+
+    sess = ort.InferenceSession(onnx_file, sess_options=sess_opt, providers=execution_provider)
+    return sess
+
+
+def benchmark(onnx_file, inputs, outputs, provider, profiling=False):
+    """Return the mean latency, in milliseconds, of one run of onnx_file."""
+    sess = create_session(onnx_file, provider, profiling)
+    input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
+    io_binding = create_io_binding(sess, input_tensors, output_tensors)
+
+    # warm up
+    for _ in range(10):
+        sess.run_with_iobinding(io_binding)
+
+    # measure with a monotonic, high-resolution clock
+    max_iters = 100
+    start_time = time.perf_counter()
+    for _ in range(max_iters):
+        sess.run_with_iobinding(io_binding)
+
+    # time is in milliseconds
+    elapsed_time = (time.perf_counter() - start_time) * 1000 / max_iters
+    return elapsed_time
diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py
new file mode 100644
index 0000000000..b7f11f0896
--- /dev/null
+++ b/onnxruntime/python/tools/microbench/matmul.py
@@ -0,0 +1,77 @@
+import argparse
+import os
+
+import numpy as np
+from benchmark import benchmark
+
+# Resolve the bundled models next to this file so the script works from any directory.
+MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
+
+
+def create_inputs_outputs(b1, b2, m, k, n, data_type):
+    """Create seeded random A (b1, b2, m, k), B (b1, b2, k, n) and a (b1, b2, m, n) output array."""
+    np.random.seed(0)
+    a = np.random.rand(b1, b2, m, k).astype(data_type)
+    b = np.random.rand(b1, b2, k, n).astype(data_type)
+    c = np.random.rand(b1, b2, m, n).astype(data_type)
+
+    inputs = {"A": a, "B": b}
+    outputs = {"return_val": c}
+
+    return inputs, outputs
+
+
+def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
+    """Append four (b1, b2, m, k, n, data_type, model) cases for one model configuration."""
+    benchmark_cases += [
+        (1, batch_size, seq_len, hidden_size, hidden_size, data_type, model),
+        (1, batch_size, seq_len, intermediate_dimension, hidden_size, data_type, model),
+        (1, batch_size, seq_len, hidden_size, intermediate_dimension, data_type, model),
+        (batch_size, num_heads, seq_len, seq_len, hidden_size // num_heads, data_type, model),
+    ]
+
+
+def create_benchmark_cases(precision="fp16"):
+    """Return the MatMul cases for bert-large and bert-base; any precision but "fp16" means fp32."""
+    benchmark_cases = []
+    if precision == "fp16":
+        model = os.path.join(MODEL_DIR, "matmul_fp16.onnx")
+        data_type = np.float16
+    else:
+        model = os.path.join(MODEL_DIR, "matmul_fp32.onnx")
+        data_type = np.float32
+
+    # (hidden_size, seq_len, num_heads, batch_size)
+    configs = (
+        (1024, 384, 16, 1),  # bert-large
+        (768, 384, 12, 1),  # bert-base
+    )
+    for hidden_size, seq_len, num_heads, batch_size in configs:
+        intermediate_dimension = hidden_size * 4
+        add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
+
+    return benchmark_cases
+
+
+def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, provider="rocm"):
+    """Return the mean latency, in milliseconds, of one MatMul of the given shape."""
+    inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
+    return benchmark(onnx_file, inputs, outputs, provider)
+
+
+def main():
+    """Benchmark every case and print its latency and achieved tflops."""
+    parser = argparse.ArgumentParser(description="Benchmark the MatMul shapes used by BERT.")
+    parser.add_argument("--provider", choices=("rocm", "cuda"), default="rocm")
+    parser.add_argument("--precision", choices=("fp16", "fp32"), default="fp16")
+    args = parser.parse_args()
+
+    for (b1, b2, m, k, n, data_type, onnx_file) in create_benchmark_cases(args.precision):
+        elapsed_ms = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args.provider)
+        # 2 flops per multiply-add; flop per millisecond divided by 1e9 is tflop per second
+        tflops = b1 * b2 * m * k * n * 2 / elapsed_ms / 1000000000
+        print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {elapsed_ms:7.4f} ms, {tflops:4.2f} tflops")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx b/onnxruntime/python/tools/microbench/models/matmul_fp16.onnx
new file mode 100644
index 
0000000000000000000000000000000000000000..d227b1abd3af072a66ee5f8c0e8507d1a583df70 GIT binary patch literal 171 zcmd