onnxruntime/onnxruntime/python/tools/microbench/benchmark.py

111 lines
3.7 KiB
Python

from abc import ABC, abstractmethod
from argparse import ArgumentParser
import time
import numpy
import onnxruntime as ort
import torch
def numpy_type(torch_type):
type_map = {torch.float32: numpy.float32,
torch.float16: numpy.float16}
return type_map[torch_type]
def add_arguments(parser: ArgumentParser):
parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
parser.add_argument('--profiling', type=bool, default=False, help='If enable profiling')
class Benchmark:
def __init__(self, model, inputs, outputs, args):
self.provider = args.provider
self.profiling = args.profiling
self.model = model
self.inputs = inputs
self.outputs = outputs
def create_input_output_tensors(self):
device = "cuda"
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
return input_tensors, output_tensors
@classmethod
def create_io_binding(cls, sess, input_tensors, output_tensors):
io_binding = sess.io_binding()
for name, tensor in input_tensors.items():
io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
for name, tensor in output_tensors.items():
io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
return io_binding
def create_session(self):
sess_opt = ort.SessionOptions()
sess_opt.enable_profiling = self.profiling
if self.provider == "rocm":
execution_provider = ["ROCMExecutionProvider"]
elif self.provider == "cuda":
execution_provider = ["CUDAExecutionProvider"]
else:
raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.")
sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider)
if self.provider == "rocm":
assert 'ROCMExecutionProvider' in sess.get_providers()
elif self.provider == "cuda":
assert 'CUDAExecutionProvider' in sess.get_providers()
return sess
def benchmark(self):
sess = self.create_session()
input_tensors, output_tensors = self.create_input_output_tensors()
io_binding = self.create_io_binding(sess, input_tensors, output_tensors)
# warm up
for iter in range(10):
sess.run_with_iobinding(io_binding)
# measure
max_iters = 100
start_time = time.time()
for iter in range(max_iters):
sess.run_with_iobinding(io_binding)
# time is in milliseconds
elapsed_time = (time.time() - start_time) * 1000 / max_iters
return elapsed_time
class BenchmarkOp(ABC):
def __init__(self, args):
self.args = args
self.cases = []
@classmethod
@abstractmethod
def create_inputs_outputs(cls, op_param):
...
def add_case(self, op_param, model):
self.cases += [(op_param, model)]
@abstractmethod
def create_cases(self):
...
@classmethod
@abstractmethod
def case_profile(cls, op_param, time):
...
def benchmark(self):
self.create_cases()
for op_param, model in self.cases:
inputs, outputs = self.create_inputs_outputs(op_param)
bm = Benchmark(model, inputs, outputs, self.args)
time = bm.benchmark()
print(self.case_profile(op_param, time))