# onnxruntime/onnxruntime/python/tools/microbench/benchmark.py

from abc import ABC, abstractmethod
from argparse import ArgumentParser
import time
import numpy
import onnxruntime as ort
import torch


def numpy_type(torch_type):
    """Return the numpy scalar type that corresponds to a torch dtype.

    Args:
        torch_type: A ``torch.dtype`` such as ``torch.float16``.

    Returns:
        The matching numpy scalar type (for example ``numpy.float16``).

    Raises:
        KeyError: If ``torch_type`` has no entry in the mapping below.
    """
    # Only dtypes with a direct numpy counterpart are listed.
    type_map = {
        torch.float16: numpy.float16,
        torch.float32: numpy.float32,
        torch.float64: numpy.float64,
        torch.int8: numpy.int8,
        torch.int16: numpy.int16,
        torch.int32: numpy.int32,
        torch.int64: numpy.int64,
        torch.uint8: numpy.uint8,
        torch.bool: numpy.bool_,
    }
    try:
        return type_map[torch_type]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # says what went wrong instead of just echoing the dtype.
        raise KeyError(f"No numpy equivalent registered for torch dtype {torch_type}") from None


def _str_to_bool(value):
    """Convert a command-line string such as ``"true"`` or ``"0"`` into a bool.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    ``"False"``) as true, so an explicit converter is needed.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognised boolean word.
    """
    # Imported here so the helper does not rely on the module-level import list.
    from argparse import ArgumentTypeError

    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays false, as it was under ``bool("")``.
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


def add_arguments(parser: ArgumentParser):
    """Register the command-line options shared by the micro-benchmarks.

    Adds ``--provider`` (default ``"rocm"``), ``--precision`` (default
    ``"fp16"``) and ``--profiling`` (default ``False``) to ``parser``.

    Args:
        parser: The parser to extend in place.
    """
    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
    # nargs="?" + const=True lets a bare ``--profiling`` switch profiling on,
    # while ``--profiling True`` / ``--profiling False`` keep working.
    parser.add_argument(
        "--profiling",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Enable profiling (accepts true/false; a bare flag means true)",
    )


class Benchmark:
    """Time repeated runs of one model through an ONNX Runtime session.

    Inputs and outputs are copied to the GPU once and bound to the session
    through I/O binding, so the measured loop only contains
    ``run_with_iobinding`` calls.
    """

    # Maps the ``provider`` option to the ONNX Runtime execution provider name.
    _EXECUTION_PROVIDERS = {
        "rocm": "ROCMExecutionProvider",
        "cuda": "CUDAExecutionProvider",
    }

    def __init__(self, model, inputs, outputs, args):
        """Store the model, its I/O buffers and the relevant options.

        Args:
            model: Passed unchanged to ``ort.InferenceSession`` as the model.
            inputs: Mapping of input name to numpy array.
            outputs: Mapping of output name to numpy array.
            args: Object exposing ``provider`` and ``profiling`` attributes.
        """
        self.provider = args.provider
        self.profiling = args.profiling
        self.model = model
        self.inputs = inputs
        self.outputs = outputs

    def create_input_output_tensors(self):
        """Copy the numpy inputs and outputs to the GPU as torch tensors.

        Returns:
            A ``(input_tensors, output_tensors)`` pair of name -> tensor dicts.
        """
        # NOTE(review): the device is "cuda" even when the provider is "rocm";
        # presumably this relies on ROCm builds of PyTorch exposing the GPU
        # under that name -- confirm.
        device = "cuda"

        def to_device(arrays):
            # The bindings below pass only a shape and a raw pointer (no
            # strides), so every tensor must be laid out contiguously.
            return {name: torch.from_numpy(array).to(device).contiguous() for name, array in arrays.items()}

        return to_device(self.inputs), to_device(self.outputs)

    @classmethod
    def create_io_binding(cls, sess, input_tensors, output_tensors):
        """Bind the given tensors to ``sess`` by device pointer.

        Args:
            sess: Session object providing ``io_binding()``.
            input_tensors: Mapping of input name to torch tensor.
            output_tensors: Mapping of output name to torch tensor.

        Returns:
            The populated I/O binding. It stores raw data pointers, so the
            tensors must stay alive for as long as the binding is used.
        """
        io_binding = sess.io_binding()
        for name, tensor in input_tensors.items():
            io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
        for name, tensor in output_tensors.items():
            io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
        return io_binding

    def create_session(self):
        """Create an inference session on the configured execution provider.

        Returns:
            The ``ort.InferenceSession`` for ``self.model``.

        Raises:
            ValueError: If ``self.provider`` is neither ``"rocm"`` nor ``"cuda"``.
            AssertionError: If the created session does not list the requested
                execution provider.
        """
        execution_provider = self._EXECUTION_PROVIDERS.get(self.provider)
        if execution_provider is None:
            raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.")

        sess_opt = ort.SessionOptions()
        sess_opt.enable_profiling = self.profiling
        sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=[execution_provider])

        # Raised explicitly (rather than with ``assert``) so the check also
        # runs under ``python -O``; the exception type is kept for callers.
        if execution_provider not in sess.get_providers():
            raise AssertionError(
                f"{execution_provider} is not active in the session; active providers: {sess.get_providers()}"
            )

        return sess

    def benchmark(self, warmup_iters=10, max_iters=100):
        """Run the model repeatedly and return the mean latency.

        Args:
            warmup_iters: Number of untimed runs executed first.
            max_iters: Number of timed runs; must be at least 1.

        Returns:
            Average time per timed run, in milliseconds.

        Raises:
            ValueError: If ``max_iters`` is smaller than 1.
        """
        if max_iters < 1:
            raise ValueError(f"max_iters must be at least 1, got {max_iters}")

        sess = self.create_session()
        # Keep the tensors referenced for the whole measurement: the binding
        # only holds their raw data pointers.
        input_tensors, output_tensors = self.create_input_output_tensors()
        io_binding = self.create_io_binding(sess, input_tensors, output_tensors)

        # warm up
        for _ in range(warmup_iters):
            sess.run_with_iobinding(io_binding)

        # measure; perf_counter is monotonic and meant for interval timing
        start_time = time.perf_counter()
        for _ in range(max_iters):
            sess.run_with_iobinding(io_binding)

        # time is in milliseconds
        elapsed_time = (time.perf_counter() - start_time) * 1000 / max_iters
        return elapsed_time


class BenchmarkOp(ABC):
    """Base class for benchmarking one operator over a list of cases.

    Subclasses register ``(op_param, model)`` cases in ``create_cases`` and
    supply the I/O buffers and the report line for each case.
    """

    def __init__(self, args):
        """Keep ``args`` for later use and start with no registered cases."""
        self.cases = []
        self.args = args

    @classmethod
    @abstractmethod
    def create_inputs_outputs(cls, op_param):
        """Return the ``(inputs, outputs)`` pair used to run ``op_param``."""

    def add_case(self, op_param, model):
        """Register one case made of an operator parameter set and its model."""
        case = (op_param, model)
        self.cases += [case]

    @abstractmethod
    def create_cases(self):
        """Populate the case list; invoked at the start of ``benchmark``."""

    @classmethod
    @abstractmethod
    def case_profile(cls, op_param, time):
        """Return the printable result for ``op_param`` given its measured ``time``."""

    def benchmark(self):
        """Create the cases, run each through ``Benchmark`` and print its profile."""
        self.create_cases()
        for case in self.cases:
            op_param, model = case
            inputs, outputs = self.create_inputs_outputs(op_param)
            runner = Benchmark(model, inputs, outputs, self.args)
            elapsed = runner.benchmark()
            print(self.case_profile(op_param, elapsed))