mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-07 00:13:17 +00:00
Add micro-benchmarks for Attention and SkipLayerNormalization ops. (#10798)
* Add micro-benchmarks for Attention and SkipLayerNormalization ops. * Add choices for argument provider and precision. * Automatically select CUDA or ROCM execution provider.
This commit is contained in:
parent
1c313f4476
commit
9cbcc93e03
7 changed files with 309 additions and 25 deletions
52
onnxruntime/python/tools/microbench/attention.py
Normal file
52
onnxruntime/python/tools/microbench/attention.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import argparse
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
||||
@dataclass
class OpParam:
    """Array dimensions and element type for one Attention benchmark case."""

    # First dimension of the generated INPUT, MASK_INDEX and output arrays.
    batch_size: int
    # Second dimension of the generated INPUT, MASK_INDEX and output arrays.
    seq_len: int
    # Last dimension of INPUT / output and first dimension of WEIGHT.
    hidden_size: int
    # Last dimension of WEIGHT and the size of BIAS.
    length: int
    # NumPy scalar type the floating-point arrays are cast to.
    data_type: type
|
||||
|
||||
|
||||
class BenchmarkAttention(BenchmarkOp):
    """Micro-benchmark driver for the Attention operator test models."""

    def __init__(self, args):
        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
        super().__init__(args)

    @classmethod
    def create_inputs_outputs(cls, op_param):
        """Generate the NumPy arrays bound to the model for one case.

        Args:
            op_param: ``OpParam`` providing the array dimensions and dtype.

        Returns:
            A pair ``(inputs, outputs)`` of dicts keyed by tensor name.
        """
        # Fixed seed: every invocation produces the same arrays.
        np.random.seed(0)
        dtype = op_param.data_type
        activation_shape = (op_param.batch_size, op_param.seq_len, op_param.hidden_size)

        # The order of the rand() calls below is kept as-is so the generated
        # values for each tensor stay the same.
        input_data = np.random.rand(*activation_shape).astype(dtype)
        weight = np.random.rand(op_param.hidden_size, op_param.length).astype(dtype)
        bias = np.random.rand(op_param.length).astype(dtype)
        # NOTE(review): rand() draws from [0, 1), so the int32 cast leaves
        # every entry 0 -- confirm that an all-zero mask is what is intended.
        mask_index = np.random.rand(op_param.batch_size, op_param.seq_len).astype(np.int32)
        output_data = np.random.rand(*activation_shape).astype(dtype)

        inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias, "MASK_INDEX": mask_index}
        outputs = {"return_val": output_data}
        return inputs, outputs

    def create_cases(self):
        """Register the benchmark cases for the precision chosen in ``args``."""
        use_fp16 = self.args.precision == "fp16"
        model = "models/attention_fp16.onnx" if use_fp16 else "models/attention_fp32.onnx"
        data_type = np.float16 if use_fp16 else np.float32
        # bert-base
        op_param = OpParam(1, 384, 768, 768 * 3, data_type)
        self.add_case(op_param, model)

    @classmethod
    def case_profile(cls, op_param, time):
        """Format one result line from the case parameters and ``time`` in ms."""
        profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms"
        return profile
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the Attention benchmark."""
    arg_parser = argparse.ArgumentParser()
    add_arguments(arg_parser)
    BenchmarkAttention(arg_parser.parse_args()).benchmark()


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,63 +1,89 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser
|
||||
import time
|
||||
import logging
|
||||
import numpy
|
||||
import onnxruntime as ort
|
||||
import time
|
||||
import torch
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def numpy_type(torch_type):
    """Return the NumPy scalar type matching a torch dtype.

    Only ``torch.float32``, ``torch.float16`` and ``torch.int32`` are mapped;
    any other key raises ``KeyError``.
    """
    torch_to_numpy = {
        torch.float32: numpy.float32,
        torch.float16: numpy.float16,
        torch.int32: numpy.int32,
    }
    return torch_to_numpy[torch_type]
|
||||
|
||||
|
||||
def _str_to_bool(value):
    """Convert a command-line token such as ``"False"`` or ``"1"`` to a bool."""
    # Imported locally because the module only imports ArgumentParser.
    from argparse import ArgumentTypeError

    token = value.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


def add_arguments(parser: ArgumentParser):
    """Register the options shared by the micro-benchmarks on ``parser``.

    Options added:
        --provider   one of ``cuda``, ``rocm``, ``cpu``; defaults to ``None``.
        --precision  ``fp16`` (default) or ``fp32``.
        --profiling  boolean, default ``False``. Accepts an explicit value
                     (``--profiling True`` / ``--profiling False``) or no
                     value at all (``--profiling``), which enables it.
    """
    # ``None`` is only the default; it is not listed in ``choices`` because
    # it cannot be typed on the command line and would only clutter --help.
    parser.add_argument("--provider", required=False, type=str,
                        choices=["cuda", "rocm", "cpu"], default=None,
                        help=("Execution provider to use. By default, a "
                              "provider is selected in the priority order "
                              "(cuda|rocm, cpu) depending on availability."))
    parser.add_argument("--precision", required=False, type=str,
                        choices=["fp16", "fp32"], default="fp16",
                        help="Number format to use")
    # ``type=bool`` would turn every non-empty string, including "False",
    # into True, so the value is parsed explicitly instead.
    parser.add_argument('--profiling', required=False, type=_str_to_bool,
                        nargs="?", const=True,
                        default=False, help='If enable profiling')
|
||||
|
||||
|
||||
def provider_name(name):
    """Translate a short provider key into its execution-provider name.

    Accepts ``"cuda"``, ``"rocm"`` or ``"cpu"``; any other key raises
    ``KeyError``.
    """
    full_names = dict(
        cuda="CUDAExecutionProvider",
        rocm="ROCMExecutionProvider",
        cpu="CPUExecutionProvider",
    )
    return full_names[name]
|
||||
|
||||
|
||||
def get_default_provider():
    """Pick the execution provider to use when none was requested.

    Returns the first of CUDA, then ROCM, that onnxruntime reports as
    available, and falls back to the CPU provider otherwise.
    """
    available = ort.get_available_providers()
    for candidate in ("CUDAExecutionProvider", "ROCMExecutionProvider"):
        if candidate in available:
            return candidate
    return "CPUExecutionProvider"
|
||||
|
||||
|
||||
class Benchmark:
|
||||
def __init__(self, model, inputs, outputs, args):
    """Store the model, its I/O arrays and the run options.

    Args:
        model: Model passed to ``ort.InferenceSession`` by ``create_session``.
        inputs: Dict of input name to NumPy array.
        outputs: Dict of output name to NumPy array.
        args: Parsed arguments; ``provider`` and ``profiling`` are read.
    """
    # ``None`` means "no provider requested": pick one by availability.
    # Identity comparison is the correct test for the None singleton.
    if args.provider is None:
        self.provider = get_default_provider()
    else:
        self.provider = provider_name(args.provider)
    logger.info(f"Execution provider: {self.provider}")
    self.profiling = args.profiling
    self.model = model
    logger.info(f"Model: {self.model}")
    self.inputs = inputs
    self.outputs = outputs
|
||||
|
||||
def create_input_output_tensors(self):
    """Convert the stored NumPy arrays to torch tensors on the run device.

    Returns:
        A pair ``(input_tensors, output_tensors)`` of dicts keyed by name.
    """
    # Both GPU providers use torch's "cuda" device string; anything else
    # stays on the CPU.
    gpu_providers = ("CUDAExecutionProvider", "ROCMExecutionProvider")
    device = "cuda" if self.provider in gpu_providers else "cpu"

    def to_device(arrays):
        return {name: torch.from_numpy(array).to(device)
                for name, array in arrays.items()}

    return to_device(self.inputs), to_device(self.outputs)
|
||||
|
||||
@classmethod
def create_io_binding(cls, sess, input_tensors, output_tensors):
    """Bind every input and output tensor to a new IO binding of ``sess``.

    Each tensor is bound by name using its device type, device id 0, the
    NumPy element type from ``numpy_type``, its shape and its data pointer.
    Inputs are bound first, then outputs.

    Returns:
        The populated IO binding object.
    """
    binding = sess.io_binding()
    groups = ((binding.bind_input, input_tensors),
              (binding.bind_output, output_tensors))
    for bind, tensors in groups:
        for name, tensor in tensors.items():
            bind(name, tensor.device.type, 0,
                 numpy_type(tensor.dtype), tensor.shape,
                 tensor.data_ptr())
    return binding
|
||||
|
||||
def create_session(self):
    """Create an inference session for ``self.model`` on ``self.provider``.

    Profiling is switched on or off in the session options according to
    ``self.profiling``.
    """
    session_options = ort.SessionOptions()
    session_options.enable_profiling = self.profiling
    return ort.InferenceSession(self.model, sess_options=session_options,
                                providers=[self.provider])
|
||||
|
||||
def benchmark(self):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,40 @@
|
|||
p2o:Ë
|
||||
Y
|
||||
INPUT
|
||||
WEIGHT
|
||||
BIAS
|
||||
|
||||
MASK_INDEX
|
||||
return_val" Attention*
|
||||
num_heads :
com.microsoft attentionZ7
|
||||
INPUT.
|
||||
,
|
||||
(
|
||||
|
||||
batch_size
|
||||
seq_len
|
||||
hidden_sizeZ&
|
||||
WEIGHT
|
||||
|
||||
|
||||
hidden_size
|
||||
lenZ
|
||||
BIAS
|
||||
|
||||
|
||||
lenZ-
|
||||
|
||||
MASK_INDEX
|
||||
|
||||
|
||||
batch_size
|
||||
seq_lenb<
|
||||
|
||||
return_val.
|
||||
,
|
||||
(
|
||||
|
||||
batch_size
|
||||
seq_len
|
||||
hidden_sizeB
|
||||
com.microsoft
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
p2o:Ë
|
||||
Y
|
||||
INPUT
|
||||
WEIGHT
|
||||
BIAS
|
||||
|
||||
MASK_INDEX
|
||||
return_val" Attention*
|
||||
num_heads :
com.microsoft attentionZ7
|
||||
INPUT.
|
||||
,(
|
||||
|
||||
batch_size
|
||||
seq_len
|
||||
hidden_sizeZ&
|
||||
WEIGHT
|
||||
|
||||
hidden_size
|
||||
lenZ
|
||||
BIAS
|
||||
|
||||
lenZ-
|
||||
|
||||
MASK_INDEX
|
||||
|
||||
|
||||
batch_size
|
||||
seq_lenb<
|
||||
|
||||
return_val.
|
||||
,(
|
||||
|
||||
batch_size
|
||||
seq_len
|
||||
hidden_sizeB
|
||||
com.microsoft
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
p2o:þ
|
||||
f
|
||||
INPUT
|
||||
SKIP
|
||||
GAMMA
|
||||
BETA
|
||||
BIAS
|
||||
return_val"SkipLayerNormalization*
|
||||
epsilonoƒ: :
com.microsoftskip_layer_normalizationZ2
|
||||
INPUT)
|
||||
'
|
||||
#
|
||||
batch
|
||||
seq_len
|
||||
hidden_sizeZ1
|
||||
SKIP)
|
||||
'
|
||||
#
|
||||
batch
|
||||
seq_len
|
||||
hidden_sizeZ
|
||||
GAMMA
|
||||
|
||||
|
||||
hidden_sizeZ
|
||||
BETA
|
||||
|
||||
|
||||
hidden_sizeZ
|
||||
BIAS
|
||||
|
||||
|
||||
hidden_sizeb5
|
||||
|
||||
return_val'
|
||||
%
|
||||
!
|
||||
batch
|
||||
seq_len
|
||||
inter_dimB
|
||||
com.microsoft
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
p2o:þ
|
||||
f
|
||||
INPUT
|
||||
SKIP
|
||||
GAMMA
|
||||
BETA
|
||||
BIAS
|
||||
return_val"SkipLayerNormalization*
|
||||
epsilonoƒ: :
com.microsoftskip_layer_normalizationZ2
|
||||
INPUT)
|
||||
'#
|
||||
batch
|
||||
seq_len
|
||||
hidden_sizeZ1
|
||||
SKIP)
|
||||
'#
|
||||
batch
|
||||
seq_len
|
||||
hidden_sizeZ
|
||||
GAMMA
|
||||
|
||||
hidden_sizeZ
|
||||
BETA
|
||||
|
||||
hidden_sizeZ
|
||||
BIAS
|
||||
|
||||
hidden_sizeb5
|
||||
|
||||
return_val'
|
||||
%!
|
||||
batch
|
||||
seq_len
|
||||
inter_dimB
|
||||
com.microsoft
|
||||
54
onnxruntime/python/tools/microbench/skip_layer_norm.py
Normal file
54
onnxruntime/python/tools/microbench/skip_layer_norm.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
import argparse
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from benchmark import BenchmarkOp, add_arguments
|
||||
|
||||
|
||||
@dataclass
class OpParam:
    """Array dimensions and element type for one SkipLayerNormalization case."""

    # First dimension of the generated INPUT, SKIP and output arrays.
    batch_size: int
    # Second dimension of the generated INPUT, SKIP and output arrays.
    seq_len: int
    # Last dimension of INPUT / SKIP / output; size of GAMMA, BETA and BIAS.
    hidden_size: int
    # NumPy scalar type every generated array is cast to.
    data_type: type
|
||||
|
||||
|
||||
class BenchmarkSkipLayerNorm(BenchmarkOp):
    """Micro-benchmark driver for the SkipLayerNormalization test models."""

    def __init__(self, args):
        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
        super().__init__(args)

    @classmethod
    def create_inputs_outputs(cls, op_param):
        """Generate the NumPy arrays bound to the model for one case.

        Args:
            op_param: ``OpParam`` providing the array dimensions and dtype.

        Returns:
            A pair ``(inputs, outputs)`` of dicts keyed by tensor name.
        """
        # Fixed seed: every invocation produces the same arrays.
        np.random.seed(0)
        dtype = op_param.data_type
        hidden = op_param.hidden_size
        activation_shape = (op_param.batch_size, op_param.seq_len, hidden)

        # The order of the rand() calls below is kept as-is so the generated
        # values for each tensor stay the same.
        input_data = np.random.rand(*activation_shape).astype(dtype)
        skip = np.random.rand(*activation_shape).astype(dtype)
        gamma = np.random.rand(hidden).astype(dtype)
        beta = np.random.rand(hidden).astype(dtype)
        bias = np.random.rand(hidden).astype(dtype)
        output_data = np.random.rand(*activation_shape).astype(dtype)

        inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma, "BETA": beta, "BIAS": bias}
        outputs = {"return_val": output_data}
        return inputs, outputs

    def create_cases(self):
        """Register the benchmark cases for the precision chosen in ``args``."""
        use_fp16 = self.args.precision == "fp16"
        model = "models/skip_layer_norm_fp16.onnx" if use_fp16 else "models/skip_layer_norm_fp32.onnx"
        data_type = np.float16 if use_fp16 else np.float32
        # bert-large
        op_param = OpParam(1, 384, 1024, data_type)
        self.add_case(op_param, model)

    @classmethod
    def case_profile(cls, op_param, time):
        """Format one result line from the case parameters and ``time`` in ms."""
        profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms"
        return profile
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the SkipLayerNormalization benchmark."""
    arg_parser = argparse.ArgumentParser()
    add_arguments(arg_parser)
    BenchmarkSkipLayerNorm(arg_parser.parse_args()).benchmark()


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue