Refactor the common code per operator into an abstract base class. (#10785)

This commit is contained in:
zhangyaobit 2022-03-07 13:15:49 -08:00 committed by GitHub
parent a08036da09
commit b7f00b9682
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 176 additions and 145 deletions

View file

@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
from argparse import ArgumentParser
import time
import numpy
@ -5,72 +6,106 @@ import onnxruntime as ort
import torch
def add_arguments(parser: ArgumentParser):
    """Register the command-line options shared by the operator benchmarks.

    Adds ``--provider`` (default ``"rocm"``), ``--precision`` (default
    ``"fp16"``) and ``--profiling`` (default ``False``) to *parser*.

    ``--profiling`` accepts an explicit boolean word (``--profiling True`` /
    ``--profiling False``) or can be given bare (``--profiling``) to enable
    profiling.
    """

    def parse_bool(value):
        # argparse hands the raw string to ``type``; plain ``bool`` would turn
        # every non-empty string (including "False") into True.
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y", "on"):
            return True
        if text in ("0", "false", "f", "no", "n", "off", ""):
            return False
        # argparse reports a ValueError from ``type`` as a usage error.
        raise ValueError(f"expected a boolean value, got {value!r}")

    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
    parser.add_argument(
        '--profiling',
        type=parse_bool,
        nargs="?",
        const=True,
        default=False,
        help='If enable profiling',
    )
def create_input_output_tensors(inputs, outputs):
    """Copy the numpy input and output arrays into torch tensors on "cuda".

    Both arguments map a name to a numpy array; the result is a pair of
    dictionaries with the same names mapped to the corresponding tensors.
    """

    def upload(arrays):
        # Wrap each numpy array as a tensor, then move it to the "cuda" device.
        return {key: torch.from_numpy(value).to("cuda") for key, value in arrays.items()}

    return upload(inputs), upload(outputs)
def numpy_type(torch_type):
    """Return the numpy scalar type matching a torch dtype.

    Only ``torch.float32`` and ``torch.float16`` are mapped; any other dtype
    raises ``KeyError``.
    """
    torch_to_numpy = dict(
        (
            (torch.float32, numpy.float32),
            (torch.float16, numpy.float16),
        )
    )
    return torch_to_numpy[torch_type]
def create_io_binding(sess, input_tensors, output_tensors):
    """Create an IO binding on *sess* and bind every input and output tensor.

    Each tensor is bound by name, passing its device type, the literal device
    id 0, the numpy element type for its dtype, its shape and its data pointer.
    Returns the populated binding object.
    """
    binding = sess.io_binding()
    device_id = 0
    for key, value in input_tensors.items():
        binding.bind_input(key, value.device.type, device_id, numpy_type(value.dtype), value.shape, value.data_ptr())
    for key, value in output_tensors.items():
        binding.bind_output(key, value.device.type, device_id, numpy_type(value.dtype), value.shape, value.data_ptr())
    return binding
def add_arguments(parser: ArgumentParser):
    """Register the command-line options shared by the operator benchmarks.

    Adds ``--provider`` (default ``"rocm"``), ``--precision`` (default
    ``"fp16"``) and ``--profiling`` (default ``False``) to *parser*.

    ``--profiling`` accepts an explicit boolean word (``--profiling True`` /
    ``--profiling False``) or can be given bare (``--profiling``) to enable
    profiling.
    """

    def parse_bool(value):
        # argparse hands the raw string to ``type``; plain ``bool`` would turn
        # every non-empty string (including "False") into True.
        text = str(value).strip().lower()
        if text in ("1", "true", "t", "yes", "y", "on"):
            return True
        if text in ("0", "false", "f", "no", "n", "off", ""):
            return False
        # argparse reports a ValueError from ``type`` as a usage error.
        raise ValueError(f"expected a boolean value, got {value!r}")

    parser.add_argument("--provider", required=False, type=str, default="rocm", help="Execution provider to use")
    parser.add_argument("--precision", required=False, type=str, default="fp16", help="Number format to use")
    parser.add_argument(
        '--profiling',
        type=parse_bool,
        nargs="?",
        const=True,
        default=False,
        help='If enable profiling',
    )
def create_session(onnx_file, args):
sess_opt = ort.SessionOptions()
sess_opt.enable_profiling = args.profiling
if args.provider == "rocm":
execution_provider = ["ROCMExecutionProvider"]
elif args.provider == "cuda":
execution_provider = ["CUDAExecutionProvider"]
else:
raise ValueError(f"The script doesn't support provider type '{args.provider}' yet.")
class Benchmark:
def __init__(self, model, inputs, outputs, args):
self.provider = args.provider
self.profiling = args.profiling
self.model = model
self.inputs = inputs
self.outputs = outputs
sess = ort.InferenceSession(onnx_file, sess_options=sess_opt, providers=execution_provider)
def create_input_output_tensors(self):
device = "cuda"
input_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.inputs.items()}
output_tensors = {name: torch.from_numpy(array).to(device) for name, array in self.outputs.items()}
return input_tensors, output_tensors
if args.provider == "rocm":
assert 'ROCMExecutionProvider' in sess.get_providers()
elif args.provider == "cuda":
assert 'CUDAExecutionProvider' in sess.get_providers()
@classmethod
def create_io_binding(cls, sess, input_tensors, output_tensors):
io_binding = sess.io_binding()
for name, tensor in input_tensors.items():
io_binding.bind_input(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
for name, tensor in output_tensors.items():
io_binding.bind_output(name, tensor.device.type, 0, numpy_type(tensor.dtype), tensor.shape, tensor.data_ptr())
return io_binding
return sess
def create_session(self):
sess_opt = ort.SessionOptions()
sess_opt.enable_profiling = self.profiling
if self.provider == "rocm":
execution_provider = ["ROCMExecutionProvider"]
elif self.provider == "cuda":
execution_provider = ["CUDAExecutionProvider"]
else:
raise ValueError(f"The script doesn't support provider type '{self.provider}' yet.")
def benchmark(onnx_file, inputs, outputs, args):
sess = create_session(onnx_file, args)
input_tensors, output_tensors = create_input_output_tensors(inputs, outputs)
io_binding = create_io_binding(sess, input_tensors, output_tensors)
sess = ort.InferenceSession(self.model, sess_options=sess_opt, providers=execution_provider)
# warm up
for iter in range(10):
sess.run_with_iobinding(io_binding)
if self.provider == "rocm":
assert 'ROCMExecutionProvider' in sess.get_providers()
elif self.provider == "cuda":
assert 'CUDAExecutionProvider' in sess.get_providers()
return sess
def benchmark(self):
sess = self.create_session()
input_tensors, output_tensors = self.create_input_output_tensors()
io_binding = self.create_io_binding(sess, input_tensors, output_tensors)
# measure
max_iters = 100
start_time = time.time()
for iter in range(max_iters):
sess.run_with_iobinding(io_binding)
# time is in milliseconds
elapsed_time = (time.time() - start_time) * 1000 / max_iters
return elapsed_time
# warm up
for iter in range(10):
sess.run_with_iobinding(io_binding)
# measure
max_iters = 100
start_time = time.time()
for iter in range(max_iters):
sess.run_with_iobinding(io_binding)
# time is in milliseconds
elapsed_time = (time.time() - start_time) * 1000 / max_iters
return elapsed_time
class BenchmarkOp(ABC):
    """Abstract driver that benchmarks one operator over a list of cases.

    Subclasses supply the cases, the input/output arrays for each case and
    the text reported for each measurement.
    """

    def __init__(self, args):
        """Keep the parsed options and start with an empty case list."""
        self.args = args
        self.cases = []

    @classmethod
    @abstractmethod
    def create_inputs_outputs(cls, op_param):
        """Return ``(inputs, outputs)`` for the case described by *op_param*."""

    def add_case(self, op_param, model):
        """Append one ``(op_param, model)`` pair to the case list."""
        self.cases.append((op_param, model))

    @abstractmethod
    def create_cases(self):
        """Populate the case list, typically through :meth:`add_case`."""

    @classmethod
    @abstractmethod
    def case_profile(cls, op_param, time):
        """Return the report text for one case and its measured time."""

    def benchmark(self):
        """Build the cases, run each through ``Benchmark`` and print its profile."""
        self.create_cases()
        for case in self.cases:
            op_param, model = case
            inputs, outputs = self.create_inputs_outputs(op_param)
            runner = Benchmark(model, inputs, outputs, self.args)
            elapsed = runner.benchmark()
            print(self.case_profile(op_param, elapsed))

View file

@ -1,59 +1,57 @@
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import benchmark, add_arguments
from benchmark import BenchmarkOp, add_arguments
def create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type):
    """Build seeded random arrays for one FastGelu case.

    Returns ``({"A": a, "B": b}, {"return_val": c})`` where ``a`` and ``c``
    have shape ``(batch, seq_len, intermediate_dimension)`` and ``b`` has
    shape ``(intermediate_dimension,)``, all cast to *data_type*.
    """
    # Fixed seed so every run draws the same values.
    np.random.seed(0)
    full_shape = (batch, seq_len, intermediate_dimension)
    # Draw order (a, b, c) determines the values each array receives.
    a = np.random.rand(*full_shape).astype(data_type)
    b = np.random.rand(intermediate_dimension).astype(data_type)
    c = np.random.rand(*full_shape).astype(data_type)
    return {"A": a, "B": b}, {"return_val": c}
@dataclass
class OpParam:
    """Parameters of one FastGelu benchmark case."""

    # The three dimensions of the "A" input and the output array.
    dim1: int
    dim2: int
    dim3: int
    # Element type the generated arrays are cast to.
    data_type: type
def add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model):
    """Add one FastGelu case tuple to *benchmark_cases* in place."""
    case = (batch_size, seq_len, intermediate_dimension, data_type, model)
    benchmark_cases += [case]
@dataclass
class ModelParam:
    """Model-level sizes from which FastGelu benchmark cases are derived."""

    batch_size: int
    seq_len: int
    # Intermediate dimension of the model.
    inter_dim: int
    # Element type the generated arrays are cast to.
    data_type: type
def create_benchmark_cases(precision):
benchmark_cases = []
if precision == "fp16":
model = "models/fast_gelu_fp16.onnx"
data_type = np.float16
else:
model = "models/fast_gelu_fp32.onnx"
data_type = np.float32
class BenchmarkFastGelu(BenchmarkOp):
def __init__(self, args):
BenchmarkOp.__init__(self, args)
# bert-large
hidden_size = 1024
seq_len = 384
batch_size = 1
intermediate_dimension = hidden_size * 4
add_benchmark_case(benchmark_cases, batch_size, seq_len, intermediate_dimension, data_type, model)
def create_inputs_outputs(cls, op_param):
    """Build seeded random arrays for the FastGelu case *op_param*.

    Returns ``({"A": a, "B": b}, {"return_val": c})`` where ``a`` and ``c``
    have shape ``(dim1, dim2, dim3)`` and ``b`` has shape ``(dim3,)``, all
    cast to ``op_param.data_type``.
    """
    # Fixed seed so every run draws the same values.
    np.random.seed(0)
    dtype = op_param.data_type
    full_shape = (op_param.dim1, op_param.dim2, op_param.dim3)
    # Draw order (a, b, c) determines the values each array receives.
    a = np.random.rand(*full_shape).astype(dtype)
    b = np.random.rand(op_param.dim3).astype(dtype)
    c = np.random.rand(*full_shape).astype(dtype)
    return {"A": a, "B": b}, {"return_val": c}
return benchmark_cases
def create_cases(self):
    """Register the FastGelu cases for the precision selected in ``self.args``.

    "fp16" selects the fp16 model file and ``np.float16``; any other
    precision falls back to the fp32 model file and ``np.float32``.
    """
    if self.args.precision == "fp16":
        model, data_type = "models/fast_gelu_fp16.onnx", np.float16
    else:
        model, data_type = "models/fast_gelu_fp32.onnx", np.float32

    # bert-large
    bert_large = ModelParam(1, 384, 1024 * 4, data_type)
    case = OpParam(bert_large.batch_size, bert_large.seq_len, bert_large.inter_dim, bert_large.data_type)
    self.add_case(case, model)
def benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args):
    """Generate the arrays for one FastGelu case and return its benchmark time."""
    inputs, outputs = create_inputs_outputs(batch, seq_len, intermediate_dimension, data_type)
    return benchmark(onnx_file, inputs, outputs, args)
def case_profile(cls, op_param, time):
    """Format the three dimensions of *op_param* and the measured *time* as one line."""
    dims = f"({op_param.dim1} {op_param.dim2} {op_param.dim3})"
    return f"(dim1 dim2 dim3) = {dims}, {time:7.4f} ms"
def main():
parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()
for (batch, seq_len, intermediate_dimension, data_type, onnx_file) in create_benchmark_cases(args.precision):
time = benchmark_fast_gelu(batch, seq_len, intermediate_dimension, data_type, onnx_file, args)
print(f"(batch seq_len inter_dim) = ({batch} {seq_len} {intermediate_dimension}), {time:7.4f} ms")
bm = BenchmarkFastGelu(args)
bm.benchmark()
if __name__ == "__main__":

View file

@ -1,72 +1,70 @@
import argparse
from dataclasses import dataclass
import numpy as np
from benchmark import benchmark, add_arguments
from benchmark import BenchmarkOp, add_arguments
def create_inputs_outputs(b1, b2, m, k, n, data_type):
    """Build seeded random arrays for one batched MatMul case.

    Returns ``({"A": a, "B": b}, {"return_val": c})`` with shapes
    ``(b1, b2, m, k)``, ``(b1, b2, k, n)`` and ``(b1, b2, m, n)``, all cast
    to *data_type*.
    """
    # Fixed seed so every run draws the same values.
    np.random.seed(0)
    batch = (b1, b2)
    # Draw order (a, b, c) determines the values each array receives.
    a = np.random.rand(*batch, m, k).astype(data_type)
    b = np.random.rand(*batch, k, n).astype(data_type)
    c = np.random.rand(*batch, m, n).astype(data_type)
    return {"A": a, "B": b}, {"return_val": c}
@dataclass
class OpParam:
    """Parameters of one batched MatMul benchmark case."""

    # Two leading batch dimensions shared by both operands and the output.
    b1: int
    b2: int
    # The operands are (m, k) and (k, n); the output is (m, n).
    m: int
    k: int
    n: int
    # Element type the generated arrays are cast to.
    data_type: type
def add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model):
    """Add the four MatMul case tuples derived from one model to *benchmark_cases* in place.

    Each tuple is ``(b1, b2, m, k, n, data_type, model)``.
    """
    head_size = int(hidden_size / num_heads)
    shapes = (
        (1, batch_size, seq_len, hidden_size, hidden_size),
        (1, batch_size, seq_len, intermediate_dimension, hidden_size),
        (1, batch_size, seq_len, hidden_size, intermediate_dimension),
        (batch_size, num_heads, seq_len, seq_len, head_size),
    )
    benchmark_cases += [shape + (data_type, model) for shape in shapes]
@dataclass
class ModelParam:
    """Model-level sizes from which MatMul benchmark cases are derived."""

    batch_size: int
    seq_len: int
    hidden_size: int
    # Intermediate dimension of the model.
    inter_dim: int
    num_heads: int
    # Element type the generated arrays are cast to.
    data_type: type
def create_benchmark_cases(precision):
benchmark_cases = []
if precision == "fp16":
model = "models/matmul_fp16.onnx"
data_type = np.float16
else:
model = "models/matmul_fp32.onnx"
data_type = np.float32
class BenchmarkMatMul(BenchmarkOp):
def __init__(self, args):
BenchmarkOp.__init__(self, args)
# bert-large
hidden_size = 1024
seq_len = 384
num_heads = 16
batch_size = 1
intermediate_dimension = hidden_size * 4
add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
def create_inputs_outputs(cls, op_param):
    """Build seeded random arrays for the MatMul case *op_param*.

    Returns ``({"A": a, "B": b}, {"return_val": c})`` with shapes
    ``(b1, b2, m, k)``, ``(b1, b2, k, n)`` and ``(b1, b2, m, n)``, all cast
    to ``op_param.data_type``.
    """
    # Fixed seed so every run draws the same values.
    np.random.seed(0)
    dtype = op_param.data_type
    batch = (op_param.b1, op_param.b2)
    # Draw order (a, b, c) determines the values each array receives.
    a = np.random.rand(*batch, op_param.m, op_param.k).astype(dtype)
    b = np.random.rand(*batch, op_param.k, op_param.n).astype(dtype)
    c = np.random.rand(*batch, op_param.m, op_param.n).astype(dtype)
    return {"A": a, "B": b}, {"return_val": c}
# bert-base
hidden_size = 768
seq_len = 384
num_heads = 12
batch_size = 1
intermediate_dimension = hidden_size * 4
add_benchmark_case(benchmark_cases, batch_size, seq_len, hidden_size, intermediate_dimension, num_heads, data_type, model)
def add_model_cases(self, mp, model):
    """Register the four MatMul cases derived from the model sizes in *mp*."""
    head_size = int(mp.hidden_size / mp.num_heads)
    shapes = (
        (1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.hidden_size),
        (1, mp.batch_size, mp.seq_len, mp.inter_dim, mp.hidden_size),
        (1, mp.batch_size, mp.seq_len, mp.hidden_size, mp.inter_dim),
        (mp.batch_size, mp.num_heads, mp.seq_len, mp.seq_len, head_size),
    )
    for b1, b2, m, k, n in shapes:
        self.add_case(OpParam(b1, b2, m, k, n, mp.data_type), model)
return benchmark_cases
def create_cases(self):
    """Register the MatMul cases for the precision selected in ``self.args``.

    "fp16" selects the fp16 model file and ``np.float16``; any other
    precision falls back to the fp32 model file and ``np.float32``.
    """
    if self.args.precision == "fp16":
        model, data_type = "models/matmul_fp16.onnx", np.float16
    else:
        model, data_type = "models/matmul_fp32.onnx", np.float32

    # bert-large
    bert_large = ModelParam(1, 384, 1024, 1024 * 4, 16, data_type)
    self.add_model_cases(bert_large, model)

    # bert-base
    bert_base = ModelParam(1, 384, 768, 768 * 4, 12, data_type)
    self.add_model_cases(bert_base, model)
def benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args):
    """Generate the arrays for one MatMul case and return its benchmark time."""
    inputs, outputs = create_inputs_outputs(b1, b2, m, k, n, data_type)
    return benchmark(onnx_file, inputs, outputs, args)
def case_profile(cls, op_param, time):
    """Format the MatMul dimensions, the measured *time* and the derived tflops as one line.

    The tflops figure is ``2 * b1 * b2 * m * k * n`` divided by *time* and
    then by 1e9.
    """
    flop_count = op_param.b1 * op_param.b2 * op_param.m * op_param.k * op_param.n * 2
    tflops = flop_count / time / 1000000000
    dims = f"({op_param.b1} {op_param.b2} {op_param.m} {op_param.k} {op_param.n})"
    return f"(b1 b2 m k n) = {dims}, {time:7.4f} ms, {tflops:4.2f} tflops"
def main():
parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()
for (b1, b2, m, k, n, data_type, onnx_file) in create_benchmark_cases(args.precision):
time = benchmark_matmul(b1, b2, m, k, n, data_type, onnx_file, args)
tflops = b1 * b2 * m * k * n * 2 / time / 1000000000
print(f"(b1 b2 m k n) = ({b1} {b2} {m} {k} {n}), {time:7.4f} ms, {tflops:4.2f} tflops")
bm = BenchmarkMatMul(args)
bm.benchmark()
if __name__ == "__main__":