Add micro-benchmarks for Attention and SkipLayerNormalization ops. (#10798)

* Add micro-benchmarks for Attention and SkipLayerNormalization ops.

* Add choices for argument provider and precision.

* Automatically select CUDA or ROCM execution provider.
This commit is contained in:
zhangyaobit 2022-03-09 18:18:51 -08:00 committed by GitHub
parent 1c313f4476
commit 9cbcc93e03
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 309 additions and 25 deletions

View file

@ -0,0 +1,52 @@
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@dataclass
class OpParam:
    """Shape and precision parameters describing one Attention benchmark case."""

    # Leading dimension of the INPUT, MASK_INDEX and output arrays.
    batch_size: int
    # Second dimension of the INPUT, MASK_INDEX and output arrays.
    seq_len: int
    # Last dimension of INPUT/output and first dimension of WEIGHT.
    hidden_size: int
    # Second dimension of WEIGHT and the size of BIAS.
    length: int
    # numpy scalar type the floating-point arrays are cast to.
    data_type: type
class BenchmarkAttention(BenchmarkOp):
    """Benchmark case definitions for the Attention op models.

    The run itself is driven by the ``benchmark()`` method inherited from
    ``BenchmarkOp``; this class only supplies the cases, their input/output
    buffers and the per-case report line.
    """

    def __init__(self, args):
        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
        super().__init__(args)

    @classmethod
    def create_inputs_outputs(cls, op_param):
        """Build the numpy input and output buffers for one case.

        Args:
            op_param: An ``OpParam`` giving the array dimensions and dtype.

        Returns:
            A ``(inputs, outputs)`` pair of dicts keyed by the model's
            input/output names.
        """
        # Fixed seed so every run is fed the same data.
        np.random.seed(0)
        dtype = op_param.data_type
        batch, seq, hidden = (op_param.batch_size, op_param.seq_len,
                              op_param.hidden_size)

        input_data = np.random.rand(batch, seq, hidden).astype(dtype)
        weight = np.random.rand(hidden, op_param.length).astype(dtype)
        bias = np.random.rand(op_param.length).astype(dtype)
        # The previous rand().astype(np.int32) truncated values in [0, 1) and
        # therefore always produced an all-zero mask; build the mask
        # explicitly instead.
        # NOTE(review): assumes a nonzero entry means "attend to this
        # position" for the com.microsoft Attention op -- confirm.
        mask_index = np.ones((batch, seq), dtype=np.int32)
        # Placeholder buffer with the output's shape; its initial contents
        # are arbitrary.
        output_data = np.random.rand(batch, seq, hidden).astype(dtype)

        inputs = {"INPUT": input_data, "WEIGHT": weight, "BIAS": bias,
                  "MASK_INDEX": mask_index}
        outputs = {"return_val": output_data}
        return inputs, outputs

    def create_cases(self):
        """Register the benchmark cases for the selected precision."""
        # NOTE(review): relies on ``self.args`` being set by
        # BenchmarkOp.__init__ -- confirm against the base class.
        use_fp16 = self.args.precision == "fp16"
        model = ("models/attention_fp16.onnx" if use_fp16
                 else "models/attention_fp32.onnx")
        data_type = np.float16 if use_fp16 else np.float32
        # bert-base
        op_param = OpParam(1, 384, 768, 768 * 3, data_type)
        self.add_case(op_param, model)

    @classmethod
    def case_profile(cls, op_param, time):
        """Format the one-line report for a case.

        Args:
            op_param: The ``OpParam`` of the case that was run.
            time: Measured time, printed with an ``ms`` suffix.

        Returns:
            The formatted report string.
        """
        profile = f"(batch_size seq_len length) = ({op_param.batch_size} {op_param.seq_len} {op_param.length}), {time:7.4f} ms"
        return profile
def main():
    """Parse the shared benchmark options and run the Attention benchmark."""
    arg_parser = argparse.ArgumentParser()
    add_arguments(arg_parser)
    parsed = arg_parser.parse_args()
    BenchmarkAttention(parsed).benchmark()


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View file

@ -1,63 +1,89 @@
from abc import ABC, abstractmethod
from argparse import ArgumentParser
import time
import logging
import numpy
import onnxruntime as ort
import time
import torch
# Configure the root logger at INFO level so the provider/model messages
# emitted through `logger.info` in this module are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def numpy_type(torch_type):
    """Return the numpy scalar type that corresponds to a torch dtype.

    Args:
        torch_type: One of ``torch.float32``, ``torch.float16`` or
            ``torch.int32``.

    Returns:
        The matching numpy type (``numpy.float32``, ``numpy.float16`` or
        ``numpy.int32``).

    Raises:
        KeyError: If ``torch_type`` is not one of the supported dtypes.
    """
    type_map = {
        torch.float32: numpy.float32,
        torch.float16: numpy.float16,
        torch.int32: numpy.int32,
    }
    return type_map[torch_type]
def _str_to_bool(value):
    """Convert a command-line token such as ``"true"`` or ``"0"`` to a bool.

    ``type=bool`` cannot be used with argparse for this purpose because
    ``bool("False")`` is ``True``: any non-empty string is truthy.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    # Local import: the module only imports ArgumentParser from argparse.
    from argparse import ArgumentTypeError

    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string stays False, as it was under bool("").
    if token in ("0", "false", "f", "no", "n", "off", ""):
        return False
    raise ArgumentTypeError(f"expected a boolean value, got {value!r}")


def add_arguments(parser: ArgumentParser):
    """Register the options shared by all micro-benchmarks on ``parser``.

    Options added:
        --provider:  ``cuda``, ``rocm`` or ``cpu``; defaults to ``None``.
        --precision: ``fp16`` (default) or ``fp32``.
        --profiling: boolean, default ``False``. Accepts an explicit value
            (``--profiling True`` / ``--profiling False``) or may be given
            bare (``--profiling``) to enable profiling.

    Args:
        parser: The ``ArgumentParser`` to extend in place.
    """
    # Each option is registered exactly once; registering the same option
    # string twice makes argparse raise a conflict error.
    # `None` is only the default, not a selectable choice: argparse does not
    # validate a non-string default against `choices`.
    parser.add_argument("--provider", required=False, type=str,
                        choices=["cuda", "rocm", "cpu"], default=None,
                        help=("Execution provider to use. By default, a "
                              "provider is selected in the priority order "
                              "(cuda|rocm, cpu) depending on availability."))
    parser.add_argument("--precision", required=False, type=str,
                        choices=["fp16", "fp32"], default="fp16",
                        help="Number format to use")
    parser.add_argument('--profiling', required=False, type=_str_to_bool,
                        nargs="?", const=True, default=False,
                        help='If enable profiling')
def provider_name(name):
    """Translate a short provider key into its execution-provider name.

    Args:
        name: ``"cuda"``, ``"rocm"`` or ``"cpu"``.

    Returns:
        The corresponding ``*ExecutionProvider`` string.

    Raises:
        KeyError: If ``name`` is not one of the keys above.
    """
    full_names = dict(
        cuda="CUDAExecutionProvider",
        rocm="ROCMExecutionProvider",
        cpu="CPUExecutionProvider",
    )
    return full_names[name]
def get_default_provider():
    """Pick the execution provider to use when none was requested.

    Returns:
        ``"CUDAExecutionProvider"`` if onnxruntime reports it as available,
        otherwise ``"ROCMExecutionProvider"`` if that is available, otherwise
        ``"CPUExecutionProvider"``.
    """
    # GPU providers are tried in priority order; CPU is the fallback.
    for candidate in ("CUDAExecutionProvider", "ROCMExecutionProvider"):
        if candidate in ort.get_available_providers():
            return candidate
    return "CPUExecutionProvider"
class Benchmark:
def __init__(self, model, inputs, outputs, args):
    """Store the model, its I/O arrays and the resolved run options.

    Args:
        model: Model handed to ``ort.InferenceSession`` by
            ``create_session``.
        inputs: Dict mapping input names to numpy arrays.
        outputs: Dict mapping output names to numpy arrays.
        args: Parsed command-line options; ``provider`` and ``profiling``
            are read from it.
    """
    # `self.provider` always holds a full "*ExecutionProvider" name: either
    # the auto-detected default or the expansion of the short key given on
    # the command line.
    self.provider = (get_default_provider() if args.provider is None
                     else provider_name(args.provider))
    logger.info(f"Execution provider: {self.provider}")
    self.profiling = args.profiling
    self.model = model
    logger.info(f"Model: {self.model}")
    self.inputs = inputs
    self.outputs = outputs
def create_input_output_tensors(self):
device = "cuda"
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
on_gpu = (self.provider == "CUDAExecutionProvider"
or self.provider == "ROCMExecutionProvider")
device = "cuda" if on_gpu else "cpu"
input_tensors = {name: torch.from_numpy(array).to(device)
for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device)
for name, array in self.outputs.items()}
return input_tensors, output_tensors
@classmethod
def create_io_binding(cls, sess, input_tensors, output_tensors):
    """Bind torch tensors to a session's inputs and outputs.

    Args:
        sess: Session object providing ``io_binding()``.
        input_tensors: Dict mapping input names to torch tensors.
        output_tensors: Dict mapping output names to torch tensors.

    Returns:
        The populated IO binding.
    """
    io_binding = sess.io_binding()
    # Each tensor is bound exactly once, by its device type, device id 0,
    # element type, shape and raw data pointer.
    for name, tensor in input_tensors.items():
        io_binding.bind_input(name, tensor.device.type, 0,
                              numpy_type(tensor.dtype), tensor.shape,
                              tensor.data_ptr())
    for name, tensor in output_tensors.items():
        io_binding.bind_output(name, tensor.device.type, 0,
                               numpy_type(tensor.dtype), tensor.shape,
                               tensor.data_ptr())
    return io_binding
def create_session(self):
    """Create the inference session for ``self.model``.

    Profiling is enabled on the session options according to
    ``self.profiling``, and ``self.provider`` is passed as the only
    requested execution provider.

    Returns:
        The created ``ort.InferenceSession``.
    """
    sess_opt = ort.SessionOptions()
    sess_opt.enable_profiling = self.profiling
    # `self.provider` is already a full "*ExecutionProvider" name (set in
    # __init__), so it is passed through without any short-name dispatch.
    sess = ort.InferenceSession(self.model, sess_options=sess_opt,
                                providers=[self.provider])
    # Surface a mismatch between the requested provider and what the
    # session reports, without aborting the run, so timings are not
    # silently attributed to the wrong provider.
    if self.provider not in sess.get_providers():
        logger.warning("Requested provider %s is not active in the session; "
                       "active providers: %s",
                       self.provider, sess.get_providers())
    return sess
def benchmark(self):

View file

@ -0,0 +1,40 @@
p2o:Ë
Y
INPUT
WEIGHT
BIAS
MASK_INDEX
return_val" Attention*
num_heads  : com.microsoft attentionZ7
INPUT.
,
(

batch_size
seq_len
 hidden_sizeZ&
WEIGHT


 hidden_size
lenZ
BIAS


lenZ-
MASK_INDEX


batch_size
seq_lenb<
return_val.
,
(

batch_size
seq_len
 hidden_sizeB
com.microsoft

View file

@ -0,0 +1,36 @@
p2o:Ë
Y
INPUT
WEIGHT
BIAS
MASK_INDEX
return_val" Attention*
num_heads  : com.microsoft attentionZ7
INPUT.
,(

batch_size
seq_len
 hidden_sizeZ&
WEIGHT

 hidden_size
lenZ
BIAS

lenZ-
MASK_INDEX


batch_size
seq_lenb<
return_val.
,(

batch_size
seq_len
 hidden_sizeB
com.microsoft

View file

@ -0,0 +1,41 @@
p2o:þ
f
INPUT
SKIP
GAMMA
BETA
BIAS
return_val"SkipLayerNormalization*
epsilonoƒ: : com.microsoftskip_layer_normalizationZ2
INPUT)
'
#
batch
seq_len
 hidden_sizeZ1
SKIP)
'
#
batch
seq_len
 hidden_sizeZ
GAMMA


 hidden_sizeZ
BETA


 hidden_sizeZ
BIAS


 hidden_sizeb5
return_val'
%
!
batch
seq_len
 inter_dimB
com.microsoft

View file

@ -0,0 +1,35 @@
p2o:þ
f
INPUT
SKIP
GAMMA
BETA
BIAS
return_val"SkipLayerNormalization*
epsilonoƒ: : com.microsoftskip_layer_normalizationZ2
INPUT)
'#
batch
seq_len
 hidden_sizeZ1
SKIP)
'#
batch
seq_len
 hidden_sizeZ
GAMMA

 hidden_sizeZ
BETA

 hidden_sizeZ
BIAS

 hidden_sizeb5
return_val'
%!
batch
seq_len
 inter_dimB
com.microsoft

View file

@ -0,0 +1,54 @@
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import BenchmarkOp, add_arguments
@dataclass
class OpParam:
    """Shape and precision parameters for one SkipLayerNormalization case."""

    # Leading dimension of the INPUT, SKIP and output arrays.
    batch_size: int
    # Second dimension of the INPUT, SKIP and output arrays.
    seq_len: int
    # Last dimension of INPUT/SKIP/output and the size of GAMMA, BETA, BIAS.
    hidden_size: int
    # numpy scalar type all arrays are cast to.
    data_type: type
class BenchmarkSkipLayerNorm(BenchmarkOp):
    """Benchmark case definitions for the SkipLayerNormalization op models.

    The run itself is driven by the ``benchmark()`` method inherited from
    ``BenchmarkOp``; this class only supplies the cases, their input/output
    buffers and the per-case report line.
    """

    def __init__(self, args):
        """Forward the parsed command-line arguments to ``BenchmarkOp``."""
        super().__init__(args)

    @classmethod
    def create_inputs_outputs(cls, op_param):
        """Build the numpy input and output buffers for one case.

        Args:
            op_param: An ``OpParam`` giving the array dimensions and dtype.

        Returns:
            A ``(inputs, outputs)`` pair of dicts keyed by the model's
            input/output names.
        """
        # Fixed seed so every run is fed the same data.
        np.random.seed(0)
        dtype = op_param.data_type
        full_shape = (op_param.batch_size, op_param.seq_len,
                      op_param.hidden_size)

        # Draw order is kept (input, skip, gamma, beta, bias, output) so the
        # seeded values match earlier runs.
        input_data = np.random.rand(*full_shape).astype(dtype)
        skip = np.random.rand(*full_shape).astype(dtype)
        gamma = np.random.rand(op_param.hidden_size).astype(dtype)
        beta = np.random.rand(op_param.hidden_size).astype(dtype)
        bias = np.random.rand(op_param.hidden_size).astype(dtype)
        # Placeholder buffer with the output's shape; its initial contents
        # are arbitrary.
        output_data = np.random.rand(*full_shape).astype(dtype)

        inputs = {"INPUT": input_data, "SKIP": skip, "GAMMA": gamma,
                  "BETA": beta, "BIAS": bias}
        outputs = {"return_val": output_data}
        return inputs, outputs

    def create_cases(self):
        """Register the benchmark cases for the selected precision."""
        # NOTE(review): relies on ``self.args`` being set by
        # BenchmarkOp.__init__ -- confirm against the base class.
        use_fp16 = self.args.precision == "fp16"
        model = ("models/skip_layer_norm_fp16.onnx" if use_fp16
                 else "models/skip_layer_norm_fp32.onnx")
        data_type = np.float16 if use_fp16 else np.float32
        # bert-large
        op_param = OpParam(1, 384, 1024, data_type)
        self.add_case(op_param, model)

    @classmethod
    def case_profile(cls, op_param, time):
        """Format the one-line report for a case.

        Args:
            op_param: The ``OpParam`` of the case that was run.
            time: Measured time, printed with an ``ms`` suffix.

        Returns:
            The formatted report string.
        """
        profile = f"(batch seq_len hidden_size) = ({op_param.batch_size} {op_param.seq_len} {op_param.hidden_size}), {time:7.4f} ms"
        return profile
def main():
    """Parse the shared benchmark options and run the SkipLayerNorm benchmark."""
    arg_parser = argparse.ArgumentParser()
    add_arguments(arg_parser)
    parsed = arg_parser.parse_args()
    BenchmarkSkipLayerNorm(parsed).benchmark()


# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()