mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
Refactor the common code per operator into an abstract base class. (#10785)
This commit is contained in:
parent
a08036da09
commit
b7f00b9682
3 changed files with 176 additions and 145 deletions
|
|
@ -1,3 +1,4 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
import time
|
||||
import numpy
|
||||
|
|
@ -5,72 +6,106 @@ import onnxruntime as ort
|
|||
import torch
|
||||
|
||||
|
||||
def add_arguments(parser: ArgumentParser):
    """Register the command-line options shared by the operator benchmarks.

    Adds ``--provider`` (default ``"rocm"``), ``--precision`` (default
    ``"fp16"``) and ``--profiling`` (default ``False``) to ``parser``.

    ``--profiling`` used ``type=bool``, which calls ``bool()`` on the raw
    string; every non-empty string is truthy, so ``--profiling False`` enabled
    profiling. The value is now parsed as an explicit boolean word.
    """

    def parse_bool(text):
        # argparse reports a ValueError raised here as a normal usage error.
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        # The empty string stays falsy, as it was under bool("").
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ValueError(f"expected a boolean value, got '{text}'")

    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
    parser.add_argument('--profiling', type=parse_bool, default=False, help='If enable profiling')
|
||||
|
||||
|
||||
def create_input_output_tensors(inputs, outputs):
    """Copy the given numpy arrays onto the "cuda" torch device.

    ``inputs`` and ``outputs`` map names to numpy arrays. Returns a pair
    ``(input_tensors, output_tensors)`` of dicts with the same keys, whose
    values are torch tensors placed on the device.
    """
    target = "cuda"

    def to_device(arrays):
        # Preserve the mapping's key order while converting each array.
        moved = {}
        for key, value in arrays.items():
            moved[key] = torch.from_numpy(value).to(target)
        return moved

    return to_device(inputs), to_device(outputs)
|
||||
|
||||
|
||||
def numpy_type(torch_type):
    """Return the numpy scalar type matching a torch dtype.

    Only ``torch.float32`` and ``torch.float16`` are mapped; any other
    dtype raises ``KeyError`` from the lookup.
    """
    torch_dtypes = (torch.float32, torch.float16)
    numpy_dtypes = (numpy.float32, numpy.float16)
    lookup = dict(zip(torch_dtypes, numpy_dtypes))
    return lookup[torch_type]
|
||||
|
||||
|
||||
def create_io_binding(sess, input_tensors, output_tensors):
    """Build an IO binding for ``sess`` covering every input and output tensor.

    Each tensor is bound by name with its device type, device id 0, the
    numpy element type for its dtype, its shape, and its raw data pointer.
    Inputs are bound first, then outputs. Returns the binding object.
    """
    # NOTE(review): tensors are presumably torch tensors (device/dtype/
    # shape/data_ptr are read from them) — confirm against callers.
    binding = sess.io_binding()

    bind_plan = (
        (binding.bind_input, input_tensors),
        (binding.bind_output, output_tensors),
    )
    for bind, tensors in bind_plan:
        for name, tensor in tensors.items():
            bind(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())

    return binding
|
||||
def add_arguments(parser: ArgumentParser):
    """Register the command-line options shared by the operator benchmarks.

    Adds ``--provider`` (default ``"rocm"``), ``--precision`` (default
    ``"fp16"``) and ``--profiling`` (default ``False``) to ``parser``.

    ``--profiling`` used ``type=bool``, which calls ``bool()`` on the raw
    string; every non-empty string is truthy, so ``--profiling False`` enabled
    profiling. The value is now parsed as an explicit boolean word.
    """

    def parse_bool(text):
        # argparse reports a ValueError raised here as a normal usage error.
        lowered = text.strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        # The empty string stays falsy, as it was under bool("").
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise ValueError(f"expected a boolean value, got '{text}'")

    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
    parser.add_argument('--profiling', type=parse_bool, default=False, help='If enable profiling')
|
||||
|
||||
|
||||
def create_session(onnx_file, args):
|
||||
sess_opt = ort.SessionOptions()
|
||||
sess_opt.enable_profiling = args.profiling
|
||||
if args.provider == "rocm":
|
||||
execution_provider = ["ROCMExecutionProvider"]
|
||||
elif args.provider == "cuda":
|
||||
execution_provider = ["CUDAExecutionProvider"]
|
||||
else:
|
||||
raise ValueError(f"The script doesn't support provider type '{args.provider}' yet.")
|
||||
class Benchmark:
|
||||
def __init__(self, model, inputs, outputs, args):
|
||||
self.provider = args.provider
|
||||
self.profiling = args.profiling
|
||||
self.model = model
|
||||
self.inputs = inputs
|
||||
self.outputs = outputs
|
||||
|
||||
sess = ort.InferenceSession(onnx_file, sess_options=sess_opt, providers=execution_provider)
|
||||
def create_input_output_tensors(self):
|
||||
device = "cuda"
|
||||
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
|
||||
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
|
||||
return input_tensors, output_tensors
|
||||
|
||||
if args.provider == "rocm":
|
||||
assert 'ROCMExecutionProvider' in sess.get_providers()
|
||||
elif args.provider == "cuda":
|
||||
assert 'CUDAExecutionProvider' in sess.get_providers()
|
||||
@classmethod
|
||||
def create_io_binding(cls, sess, input_tensors, output_tensors):
|
||||
io_binding = sess.io_binding()
|
||||
for name, tensor in input_tensors.items():
|
||||
io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
|
||||
for name, tensor in output_tensors.items():
|
||||
io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
|
||||
return io_binding
|
||||
|
||||
return sess
|
||||
|
||||
def create_session(self):
|
||||
sess_opt = ort.SessionOptions()
|
||||
sess_opt.enable_profiling = self.profiling
|
||||
if self.provider == "rocm":
|
||||
execution_provider = ["ROCMExecutionProvider"]
|
||||
elif self.provider == "cuda":
|
||||
execution_provider = ["CUDAExecutionProvider"]
|
||||
else:
|
||||
raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.")
|
||||
|
||||
def benchmark(onnx_file, inputs, outputs, args):
|
||||
sess = create_session(onnx_file, args)
|
||||
input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
|
||||
io_binding = create_io_binding(sess, input_tensors, output_tensors)
|
||||
sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider)
|
||||
|
||||
# warm up
|
||||
for iter in range(10):
|
||||
sess.run_with_iobinding(io_binding)
|
||||
if self.provider == "rocm":
|
||||
assert 'ROCMExecutionProvider' in sess.get_providers()
|
||||
elif self.provider == "cuda":
|
||||
assert 'CUDAExecutionProvider' in sess.get_providers()
|
||||
|
||||
return sess
|
||||
|
||||
def benchmark(self):
|
||||
sess = self.create_session()
|
||||
input_tensors, output_tensors = self.create_input_output_tensors()
|
||||
io_binding = self.create_io_binding(sess, input_tensors, output_tensors)
|
||||
|
||||
# measure
|
||||
max_iters = 100
|
||||
start_time = time.time()
|
||||
for iter in range(max_iters):
|
||||
sess.run_with_iobinding(io_binding)
|
||||
|
||||
# time is in milliseconds
|
||||
elapsed_time = (time.time() - start_time) * 1000 / max_iters
|
||||
return elapsed_time
|
||||
# warm up
|
||||
for iter in range(10):
|
||||
sess.run_with_iobinding(io_binding)
|
||||
|
||||
# measure
|
||||
max_iters = 100
|
||||
start_time = time.time()
|
||||
for iter in range(max_iters):
|
||||
sess.run_with_iobinding(io_binding)
|
||||
|
||||
# time is in milliseconds
|
||||
elapsed_time = (time.time() - start_time) * 1000 / max_iters
|
||||
return elapsed_time
|
||||
|
||||
|
||||
class BenchmarkOp(ABC):
    """Abstract driver that benchmarks one operator over a list of cases.

    Subclasses supply the cases (``create_cases``), the tensors for a case
    (``create_inputs_outputs``) and the report line (``case_profile``).
    """

    def __init__(self, args):
        # Parsed command-line arguments, forwarded to each Benchmark run.
        self.args = args
        # Accumulated (op_param, model) pairs, filled by add_case().
        self.cases = []

    @classmethod
    @abstractmethod
    def create_inputs_outputs(cls, op_param):
        """Return ``(inputs, outputs)`` for the case described by ``op_param``."""

    def add_case(self, op_param, model):
        """Queue one benchmark case as an ``(op_param, model)`` pair."""
        self.cases.append((op_param, model))

    @abstractmethod
    def create_cases(self):
        """Populate ``self.cases``; called at the start of ``benchmark()``."""

    @classmethod
    @abstractmethod
    def case_profile(cls, op_param, time):
        """Return the printable report for a case given its measured ``time``."""

    def benchmark(self):
        """Create the cases, run each one through ``Benchmark`` and print its profile."""
        self.create_cases()
        for case in self.cases:
            op_param, model = case
            inputs, outputs = self.create_inputs_outputs(op_param)
            runner = Benchmark(model, inputs, outputs, self.args)
            elapsed = runner.benchmark()
            print(self.case_profile(op_param, elapsed))
|
||||
|
|
|
|||
|
|
@ -1,59 +1,57 @@
|
|||
import argparse
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from benchmark import benchmark, add_arguments
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
||||
def create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type):
    """Generate seeded random tensors for one benchmark case.

    Returns ``(inputs, outputs)``: ``inputs`` holds "A" of shape
    ``(batch, seq_len, intermediate_dimension)`` and "B" of shape
    ``(intermediate_dimension,)``; ``outputs`` holds "return_val" with the
    same shape as "A". All arrays are cast to ``data_type``.
    """
    # Fixed seed so every run sees the same data.
    np.random.seed(0)

    full_shape = (batch, seq_len, intermediate_dimension)
    # Draw order (A, B, return_val) determines the generated values.
    tensor_a = np.random.rand(*full_shape).astype(data_type)
    tensor_b = np.random.rand(intermediate_dimension).astype(data_type)
    result = np.random.rand(*full_shape).astype(data_type)

    return {"A": tensor_a, "B": tensor_b}, {"return_val": result}
|
||||
@dataclass
class OpParam:
    """Three dimensions plus an element type describing one operator case."""

    # Sizes of the three tensor axes, in order.
    dim1: int
    dim2: int
    dim3: int
    # Element type the generated arrays are cast to.
    data_type: type
|
||||
|
||||
|
||||
def add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model):
    """Append one case tuple to ``benchmark_cases`` in place.

    The tuple is ``(batch_size, seq_len, intermediate_dimension, data_type,
    model)``. Nothing is returned.
    """
    case = (batch_size, seq_len, intermediate_dimension, data_type, model)
    benchmark_cases.append(case)
|
||||
@dataclass
class ModelParam:
    """Model-level sizes from which operator cases are derived."""

    # Batch size and sequence length.
    batch_size: int
    seq_len: int
    # Intermediate dimension.
    inter_dim: int
    # Element type for generated data.
    data_type: type
|
||||
|
||||
|
||||
def create_benchmark_cases(precision):
|
||||
benchmark_cases = []
|
||||
if precision == "fp16":
|
||||
model = "models/fast_gelu_fp16.onnx"
|
||||
data_type = np.float16
|
||||
else:
|
||||
model = "models/fast_gelu_fp32.onnx"
|
||||
data_type = np.float32
|
||||
class BenchmarkFastGelu(BenchmarkOp):
|
||||
def __init__(self, args):
|
||||
BenchmarkOp.__init__(self, args)
|
||||
|
||||
# bert-large
|
||||
hidden_size = 1024
|
||||
seq_len = 384
|
||||
batch_size = 1
|
||||
intermediate_dimension = hidden_size * 4
|
||||
add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model)
|
||||
def create_inputs_outputs(cls, op_param):
|
||||
np.random.seed(0)
|
||||
a = np.random.rand(op_param.dim1, op_param.dim2, op_param.dim3).astype(op_param.data_type)
|
||||
b = np.random.rand(op_param.dim3).astype(op_param.data_type)
|
||||
c = np.random.rand(op_param.dim1, op_param.dim2, op_param.dim3).astype(op_param.data_type)
|
||||
inputs = {"A": a, "B": b}
|
||||
outputs = {"return_val": c}
|
||||
return inputs, outputs
|
||||
|
||||
return benchmark_cases
|
||||
def create_cases(self):
|
||||
model = "models/fast_gelu_fp16.onnx" if self.args.precision == "fp16" else "models/fast_gelu_fp32.onnx"
|
||||
data_type = np.float16 if self.args.precision == "fp16" else np.float32
|
||||
# bert-large
|
||||
model_param = ModelParam(1, 384, 1024 * 4, data_type)
|
||||
op_param = OpParam(model_param.batch_size, model_param.seq_len, model_param.inter_dim, model_param.data_type)
|
||||
self.add_case(op_param, model)
|
||||
|
||||
|
||||
def benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args):
|
||||
inputs, outputs = create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type)
|
||||
time = benchmark(onnx_file, inputs, outputs, args)
|
||||
return time
|
||||
def case_profile(cls, op_param, time):
|
||||
profile = f"(dim1 dim2 dim3) = ({op_param.dim1} {op_param.dim2} {op_param.dim3}), {time:7.4f} ms"
|
||||
return profile
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
add_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
for (batch, seq_len, intermediate_dimension, data_type, onnx_file) in create_benchmark_cases(args.precision):
|
||||
time = benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args)
|
||||
print(f"(batch seq_len inter_dim) = ({batch} {seq_len} {intermediate_dimension}), {time:7.4f} ms")
|
||||
bm = BenchmarkFastGelu(args)
|
||||
bm.benchmark()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,72 +1,70 @@
|
|||
import argparse
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from benchmark import benchmark, add_arguments
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
||||
def create_inputs_outputs(b1, b2, m, k, n, data_type):
    """Generate seeded random operands and an output buffer for one case.

    Returns ``(inputs, outputs)``: "A" has shape ``(b1, b2, m, k)``, "B" has
    shape ``(b1, b2, k, n)`` and "return_val" has shape ``(b1, b2, m, n)``.
    All arrays are cast to ``data_type``.
    """
    # Fixed seed so every run sees the same data.
    np.random.seed(0)

    # Dict order fixes the draw order (A, B, return_val), which fixes the values.
    shapes = {
        "A": (b1, b2, m, k),
        "B": (b1, b2, k, n),
        "return_val": (b1, b2, m, n),
    }
    arrays = {}
    for label, shape in shapes.items():
        arrays[label] = np.random.rand(*shape).astype(data_type)

    inputs = {"A": arrays["A"], "B": arrays["B"]}
    outputs = {"return_val": arrays["return_val"]}
    return inputs, outputs
|
||||
@dataclass
class OpParam:
    """Batch and matrix dimensions plus element type for one operator case."""

    # Two leading batch dimensions.
    b1: int
    b2: int
    # Matrix dimensions.
    m: int
    k: int
    n: int
    # Element type the generated arrays are cast to.
    data_type: type
|
||||
|
||||
|
||||
def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
    """Append the four cases derived from one model size, in place.

    Each appended tuple is ``(b1, b2, m, k, n, data_type, model)``. Three
    cases use ``b1 = 1`` and ``b2 = batch_size``; the fourth uses
    ``batch_size`` and ``num_heads`` as the batch dimensions with
    ``n = int(hidden_size / num_heads)``. Nothing is returned.
    """
    head_size = int(hidden_size / num_heads)
    tail = (data_type, model)

    dims = (
        (1, batch_size, seq_len, hidden_size, hidden_size),
        (1, batch_size, seq_len, intermediate_dimension, hidden_size),
        (1, batch_size, seq_len, hidden_size, intermediate_dimension),
        (batch_size, num_heads, seq_len, seq_len, head_size),
    )
    benchmark_cases.extend(shape + tail for shape in dims)
|
||||
@dataclass
class ModelParam:
    """Model-level sizes from which operator cases are derived."""

    # Batch size and sequence length.
    batch_size: int
    seq_len: int
    # Hidden and intermediate dimensions.
    hidden_size: int
    inter_dim: int
    # Number of attention heads.
    num_heads: int
    # Element type for generated data.
    data_type: type
|
||||
|
||||
|
||||
def create_benchmark_cases(precision):
|
||||
benchmark_cases = []
|
||||
if precision == "fp16":
|
||||
model = "models/matmul_fp16.onnx"
|
||||
data_type = np.float16
|
||||
else:
|
||||
model = "models/matmul_fp32.onnx"
|
||||
data_type = np.float32
|
||||
class BenchmarkMatMul(BenchmarkOp):
|
||||
def __init__(self, args):
|
||||
BenchmarkOp.__init__(self, args)
|
||||
|
||||
# bert-large
|
||||
hidden_size = 1024
|
||||
seq_len = 384
|
||||
num_heads = 16
|
||||
batch_size = 1
|
||||
intermediate_dimension = hidden_size * 4
|
||||
add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
|
||||
def create_inputs_outputs(cls, op_param):
|
||||
np.random.seed(0)
|
||||
a = np.random.rand(op_param.b1, op_param.b2, op_param.m, op_param.k).astype(op_param.data_type)
|
||||
b = np.random.rand(op_param.b1, op_param.b2, op_param.k, op_param.n).astype(op_param.data_type)
|
||||
c = np.random.rand(op_param.b1, op_param.b2, op_param.m, op_param.n).astype(op_param.data_type)
|
||||
inputs = {"A": a, "B": b}
|
||||
outputs = {"return_val": c}
|
||||
return inputs, outputs
|
||||
|
||||
# bert-base
|
||||
hidden_size = 768
|
||||
seq_len = 384
|
||||
num_heads = 12
|
||||
batch_size = 1
|
||||
intermediate_dimension = hidden_size * 4
|
||||
add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
|
||||
def add_model_cases(self, mp, model):
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.hidden_size, mp.data_type), model)
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size, mp.data_type), model)
|
||||
self.add_case(OpParam(1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim, mp.data_type), model)
|
||||
self.add_case(OpParam(mp.batch_size, mp.num_heads, mp.seq_len, mp.seq_len, int(mp.hidden_size / mp.num_heads), mp.data_type), model)
|
||||
|
||||
return benchmark_cases
|
||||
def create_cases(self):
|
||||
model = "models/matmul_fp16.onnx" if self.args.precision == "fp16" else "models/matmul_fp32.onnx"
|
||||
data_type = np.float16 if self.args.precision == "fp16" else np.float32
|
||||
# bert-large
|
||||
model_param = ModelParam(1, 384, 1024, 1024 * 4, 16, data_type)
|
||||
self.add_model_cases(model_param, model)
|
||||
# bert-base
|
||||
model_param = ModelParam(1, 384, 768, 768 * 4, 12, data_type)
|
||||
self.add_model_cases(model_param, model)
|
||||
|
||||
|
||||
def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args):
|
||||
inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
|
||||
time = benchmark(onnx_file, inputs, outputs, args)
|
||||
return time
|
||||
def case_profile(cls, op_param, time):
|
||||
tflops = op_param.b1 * op_param.b2 * op_param.m * op_param.k * op_param.n * 2 / time / 1000000000
|
||||
profile = f"(b1 b2 m k n) = ({op_param.b1} {op_param.b2} {op_param.m} {op_param.k} {op_param.n}), {time:7.4f} ms, {tflops:4.2f} tflops"
|
||||
return profile
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
add_arguments(parser)
|
||||
args = parser.parse_args()
|
||||
|
||||
for (b1, b2, m, k, n, data_type, onnx_file) in create_benchmark_cases(args.precision):
|
||||
time = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args)
|
||||
tflops = b1 * b2 * m * k * n * 2 / time / 1000000000
|
||||
print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {time:7.4f} ms, {tflops:4.2f} tflops")
|
||||
bm = BenchmarkMatMul(args)
|
||||
bm.benchmark()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Reference in a new issue