diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py index e7fabd0335..cb8e5f57c2 100644 --- a/onnxruntime/python/tools/microbench/benchmark.py +++ b/onnxruntime/python/tools/microbench/benchmark.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from argparse import ArgumentParser import time import numpy @@ -5,72 +6,106 @@ import onnxruntime as ort import torch -def add_arguments(parser: ArgumentParser): - parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use") - parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use") - parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling') - - -def create_input_output_tensors(inputs, outputs): - device = "cuda" - input_tensors = {name: torch.from_numpy(array).to(device) for name, array in inputs.items()} - output_tensors = {name: torch.from_numpy(array).to(device) for name, array in outputs.items()} - return input_tensors, output_tensors - - def numpy_type(torch_type): type_map = {torch.float32: numpy.float32, torch.float16: numpy.float16} return type_map[torch_type] -def create_io_binding(sess, input_tensors, output_tensors): - io_binding = sess.io_binding() - - for name, tensor in input_tensors.items(): - io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) - - for name, tensor in output_tensors.items(): - io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) - - return io_binding +def add_arguments(parser: ArgumentParser): + parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use") + parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use") + parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling') -def create_session(onnx_file, args): - sess_opt = ort.SessionOptions() - sess_opt.enable_profiling = args.profiling - if args.provider == "rocm": - execution_provider = ["ROCMExecutionProvider"] - elif args.provider == "cuda": - execution_provider = ["CUDAExecutionProvider"] - else: - raise ValueError(f"The script doesn't support provider type '{args.provider}' yet.") +class Benchmark: + def __init__(self, model, inputs, outputs, args): + self.provider = args.provider + self.profiling = args.profiling + self.model = model + self.inputs = inputs + self.outputs = outputs - sess = ort.InferenceSession(onnx_file, sess_options=sess_opt, providers=execution_provider) + def create_input_output_tensors(self): + device = "cuda" + input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()} + output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()} + return input_tensors, output_tensors - if args.provider == "rocm": - assert 'ROCMExecutionProvider' in sess.get_providers() - elif args.provider == "cuda": - assert 'CUDAExecutionProvider' in sess.get_providers() + @classmethod + def create_io_binding(cls, sess, input_tensors, output_tensors): + io_binding = sess.io_binding() + for name, tensor in input_tensors.items(): + io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + for name, tensor in output_tensors.items(): + io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr()) + return io_binding - return sess - + def create_session(self): + sess_opt = ort.SessionOptions() + sess_opt.enable_profiling = self.profiling + if self.provider == "rocm": + execution_provider = ["ROCMExecutionProvider"] + elif self.provider == "cuda": + execution_provider = ["CUDAExecutionProvider"] + else: + raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.") -def benchmark(onnx_file, inputs, outputs, args): - sess = create_session(onnx_file, args) - input_tensors, output_tensors = create_input_output_tensors(inputs, outputs) - io_binding = create_io_binding(sess, input_tensors, output_tensors) + sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider) - # warm up - for iter in range(10): - sess.run_with_iobinding(io_binding) + if self.provider == "rocm": + assert 'ROCMExecutionProvider' in sess.get_providers() + elif self.provider == "cuda": + assert 'CUDAExecutionProvider' in sess.get_providers() + + return sess + + def benchmark(self): + sess = self.create_session() + input_tensors, output_tensors = self.create_input_output_tensors() + io_binding = self.create_io_binding(sess, input_tensors, output_tensors) - # measure - max_iters = 100 - start_time = time.time() - for iter in range(max_iters): - sess.run_with_iobinding(io_binding) - - # time is in milliseconds - elapsed_time = (time.time() - start_time) * 1000 / max_iters - return elapsed_time + # warm up + for iter in range(10): + sess.run_with_iobinding(io_binding) + + # measure + max_iters = 100 + start_time = time.time() + for iter in range(max_iters): + sess.run_with_iobinding(io_binding) + + # time is in milliseconds + elapsed_time = (time.time() - start_time) * 1000 / max_iters + return elapsed_time + + +class BenchmarkOp(ABC): + def __init__(self, args): + self.args = args + self.cases = [] + + @classmethod + @abstractmethod + def create_inputs_outputs(cls, op_param): + ... + + def add_case(self, op_param, model): + self.cases += [(op_param, model)] + + @abstractmethod + def create_cases(self): + ... + + @classmethod + @abstractmethod + def case_profile(cls, op_param, time): + ... + + def benchmark(self): + self.create_cases() + for op_param, model in self.cases: + inputs, outputs = self.create_inputs_outputs(op_param) + bm = Benchmark(model, inputs, outputs, self.args) + time = bm.benchmark() + print(self.case_profile(op_param, time)) diff --git a/onnxruntime/python/tools/microbench/fast_gelu.py b/onnxruntime/python/tools/microbench/fast_gelu.py index d5eeec0b98..3014cf6234 100644 --- a/onnxruntime/python/tools/microbench/fast_gelu.py +++ b/onnxruntime/python/tools/microbench/fast_gelu.py @@ -1,59 +1,57 @@ import argparse +from dataclasses import dataclass import numpy as np -from benchmark import benchmark, add_arguments +from benchmark import BenchmarkOp, add_arguments -def create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type): - np.random.seed(0) - a = np.random.rand(batch, seq_len, intermediate_dimension).astype(data_type) - b = np.random.rand(intermediate_dimension).astype(data_type) - c = np.random.rand(batch, seq_len, intermediate_dimension).astype(data_type) - - inputs = {"A": a, "B": b} - outputs = {"return_val": c} - - return inputs, outputs +@dataclass +class OpParam: + dim1: int + dim2: int + dim3: int + data_type: type -def add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model): - benchmark_cases += [ - (batch_size, seq_len, intermediate_dimension, data_type, model), - ] +@dataclass +class ModelParam: + batch_size: int + seq_len: int + inter_dim: int + data_type: type -def create_benchmark_cases(precision): - benchmark_cases = [] - if precision == "fp16": - model = "models/fast_gelu_fp16.onnx" - data_type = np.float16 - else: - model = "models/fast_gelu_fp32.onnx" - data_type = np.float32 +class BenchmarkFastGelu(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) - # bert-large - hidden_size = 1024 - seq_len = 384 - batch_size = 1 - intermediate_dimension = hidden_size * 4 - add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model) + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + a = np.random.rand(op_param.dim1, op_param.dim2, op_param.dim3).astype(op_param.data_type) + b = np.random.rand(op_param.dim3).astype(op_param.data_type) + c = np.random.rand(op_param.dim1, op_param.dim2, op_param.dim3).astype(op_param.data_type) + inputs = {"A": a, "B": b} + outputs = {"return_val": c} + return inputs, outputs - return benchmark_cases + def create_cases(self): + model = "models/fast_gelu_fp16.onnx" if self.args.precision == "fp16" else "models/fast_gelu_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-large + model_param = ModelParam(1, 384, 1024 * 4, data_type) + op_param = OpParam(model_param.batch_size, model_param.seq_len, model_param.inter_dim, model_param.data_type) + self.add_case(op_param, model) - -def benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args): - inputs, outputs = create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type) - time = benchmark(onnx_file, inputs, outputs, args) - return time + def case_profile(cls, op_param, time): + profile = f"(dim1 dim2 dim3) = ({op_param.dim1} {op_param.dim2} {op_param.dim3}), {time:7.4f} ms" + return profile def main(): parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() - - for (batch, seq_len, intermediate_dimension, data_type, onnx_file) in create_benchmark_cases(args.precision): - time = benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args) - print(f"(batch seq_len inter_dim) = ({batch} {seq_len} {intermediate_dimension}), {time:7.4f} ms") + bm = BenchmarkFastGelu(args) + bm.benchmark() if __name__ == "__main__": diff --git a/onnxruntime/python/tools/microbench/matmul.py b/onnxruntime/python/tools/microbench/matmul.py index 006a097301..8c091d97f0 100644 --- a/onnxruntime/python/tools/microbench/matmul.py +++ b/onnxruntime/python/tools/microbench/matmul.py @@ -1,72 +1,70 @@ import argparse +from dataclasses import dataclass import numpy as np -from benchmark import benchmark, add_arguments +from benchmark import BenchmarkOp, add_arguments -def create_inputs_outputs(b1, b2, m, k, n, data_type): - np.random.seed(0) - a = np.random.rand(b1, b2, m, k).astype(data_type) - b = np.random.rand(b1, b2, k, n).astype(data_type) - c = np.random.rand(b1, b2, m, n).astype(data_type) - - inputs = {"A": a, "B": b} - outputs = {"return_val": c} - - return inputs, outputs +@dataclass +class OpParam: + b1: int + b2: int + m: int + k: int + n: int + data_type: type -def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model): - benchmark_cases += [ - (1, batch_size, seq_len, hidden_size, hidden_size, data_type, model), - (1, batch_size, seq_len, intermediate_dimension, hidden_size, data_type, model), - (1, batch_size, seq_len, hidden_size, intermediate_dimension, data_type, model), - (batch_size, num_heads, seq_len, seq_len, int(hidden_size / num_heads), data_type, model), - ] +@dataclass +class ModelParam: + batch_size: int + seq_len: int + hidden_size: int + inter_dim: int + num_heads: int + data_type: type -def create_benchmark_cases(precision): - benchmark_cases = [] - if precision == "fp16": - model = "models/matmul_fp16.onnx" - data_type = np.float16 - else: - model = "models/matmul_fp32.onnx" - data_type = np.float32 +class BenchmarkMatMul(BenchmarkOp): + def __init__(self, args): + BenchmarkOp.__init__(self, args) - # bert-large - hidden_size = 1024 - seq_len = 384 - num_heads = 16 - batch_size = 1 - intermediate_dimension = hidden_size * 4 - add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model) + def create_inputs_outputs(cls, op_param): + np.random.seed(0) + a = np.random.rand(op_param.b1, op_param.b2, op_param.m, op_param.k).astype(op_param.data_type) + b = np.random.rand(op_param.b1, op_param.b2, op_param.k, op_param.n).astype(op_param.data_type) + c = np.random.rand(op_param.b1, op_param.b2, op_param.m, op_param.n).astype(op_param.data_type) + inputs = {"A": a, "B": b} + outputs = {"return_val": c} + return inputs, outputs - # bert-base - hidden_size = 768 - seq_len = 384 - num_heads = 12 - batch_size = 1 - intermediate_dimension = hidden_size * 4 - add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model) + def add_model_cases(self, mp, model): + self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.hidden_size, mp.data_type), model) + self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type), model) + self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type), model) + self.add_case(OpParam(mp.batch_size, mp.num_heads, mp.seq_len, mp.seq_len, int(mp.hidden_size / mp.num_heads), mp.data_type), model) - return benchmark_cases + def create_cases(self): + model = "models/matmul_fp16.onnx" if self.args.precision == "fp16" else "models/matmul_fp32.onnx" + data_type = np.float16 if self.args.precision == "fp16" else np.float32 + # bert-large + model_param = ModelParam(1, 384, 1024, 1024 * 4, 16, data_type) + self.add_model_cases(model_param, model) + # bert-base + model_param = ModelParam(1, 384, 768, 768 * 4, 12, data_type) + self.add_model_cases(model_param, model) - -def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args): - inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type) - time = benchmark(onnx_file, inputs, outputs, args) - return time + def case_profile(cls, op_param, time): + tflops = op_param.b1 * op_param.b2 * op_param.m * op_param.k * op_param.n * 2 / time / 1000000000 + profile = f"(b1 b2 m k n) = ({op_param.b1} {op_param.b2} {op_param.m} {op_param.k} {op_param.n}), {time:7.4f} ms, {tflops:4.2f} tflops" + return profile def main(): parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() - - for (b1, b2, m, k, n, data_type, onnx_file) in create_benchmark_cases(args.precision): - time = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args) - tflops = b1 * b2 * m * k * n * 2 / time / 1000000000 - print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {time:7.4f} ms, {tflops:4.2f} tflops") + bm = BenchmarkMatMul(args) + bm.benchmark() if __name__ == "__main__":