Support specifying an execution provider in benchmark script (#10453)

* Support specifying execution providers.

* Change default provider setting to None.

* Add support for bert_perf_test script.

* Include the ROCM/CUDA EP as a fallback when the MIGraphX/TensorRT EP is selected.

* Assert that the fallback EPs are included in the session's providers.

* Add model class AutoModelForCausalLM and other minor updates.

Co-authored-by: Yao Zhang <zhanyao@microsoft.com>
This commit is contained in:
zhangyaobit 2022-02-02 19:11:31 -08:00 committed by GitHub
parent a405658370
commit 239c6ad3f0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 79 additions and 22 deletions

View file

@ -68,13 +68,14 @@ import torch
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model, LxmertConfig)
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
def run_onnxruntime(use_gpu, provider, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
import onnxruntime
results = []
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
if (use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()) and
('ROCMExecutionProvider' not in onnxruntime.get_available_providers())):
logger.error(
"Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
)
@ -105,6 +106,7 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
ort_session = create_onnxruntime_session(onnx_model_file,
use_gpu,
provider,
enable_all_optimization=True,
num_threads=num_threads,
verbose=verbose)
@ -425,7 +427,13 @@ def parse_arguments():
default=os.path.join('.', 'onnx_models'),
help="Directory to store onnx models")
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on cuda device")
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
parser.add_argument("--provider",
required=False,
type=str,
default=None,
help="Execution provider to use")
parser.add_argument(
"-p",
@ -545,7 +553,7 @@ def main():
if enable_onnxruntime:
try:
use_raw_attention_mask = True
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
results += run_onnxruntime(args.use_gpu, args.provider, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts,
args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir,
args.verbose, args.overwrite, args.disable_ort_io_binding,

View file

@ -39,11 +39,11 @@ IO_BINDING_DATA_TYPE_MAP = {
def create_onnxruntime_session(onnx_model_path,
use_gpu,
provider=None,
enable_all_optimization=True,
num_threads=-1,
enable_profiling=False,
verbose=False,
use_dml=False):
verbose=False):
session = None
try:
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
@ -68,8 +68,16 @@ def create_onnxruntime_session(onnx_model_path,
logger.debug(f"Create session for onnx model: {onnx_model_path}")
if use_gpu:
if use_dml:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
else:
@ -89,7 +97,7 @@ def setup_logger(verbose=True):
logging.getLogger("transformers").setLevel(logging.WARNING)
def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):
def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
if cache_dir and not os.path.exists(cache_dir):
os.makedirs(cache_dir)
@ -98,7 +106,7 @@ def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):
import onnxruntime
if use_gpu:
if use_dml:
if provider == 'dml':
assert 'DmlExecutionProvider' in onnxruntime.get_available_providers(
), "Please install onnxruntime-directml package to test GPU inference."

View file

@ -36,6 +36,7 @@ class TestSetting:
test_cases: int
test_times: int
use_gpu: bool
provider: str
intra_op_num_threads: int
seed: int
verbose: bool
@ -50,7 +51,7 @@ class ModelSetting:
opt_level: int
def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None):
import onnxruntime
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
@ -61,8 +62,21 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
if intra_op_num_threads is None and graph_optimization_level is None:
session = onnxruntime.InferenceSession(model_path)
else:
execution_providers = ['CPUExecutionProvider'
] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
if use_gpu:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CPUExecutionProvider']
sess_options = onnxruntime.SessionOptions()
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
@ -86,7 +100,23 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)
if use_gpu:
assert 'CUDAExecutionProvider' in session.get_providers()
if provider == 'dml':
assert 'DmlExecutionProvider' in session.get_providers()
elif provider == 'rocm':
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'migraphx':
assert 'MIGraphXExecutionProvider' in session.get_providers()
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'cuda':
assert 'CUDAExecutionProvider' in session.get_providers()
elif provider == 'tensorrt':
assert 'TensorrtExecutionProvider' in session.get_providers()
assert 'CUDAExecutionProvider' in session.get_providers()
else:
assert 'CUDAExecutionProvider' in session.get_providers()
else:
assert 'CPUExecutionProvider' in session.get_providers()
return session
@ -117,7 +147,7 @@ def to_string(model_path, session, test_setting):
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
session = create_session(model_setting.model_path, test_setting.use_gpu, test_setting.provider, intra_op_num_threads,
model_setting.opt_level)
output_names = [output.name for output in session.get_outputs()]
@ -239,6 +269,12 @@ def parse_arguments():
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument("--provider",
required=False,
type=str,
default=None,
help="Execution provider to use")
parser.add_argument('-n',
'--intra_op_num_threads',
required=False,
@ -276,7 +312,7 @@ def main():
for batch_size in batch_size_set:
test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu,
args.intra_op_num_threads, args.seed, args.verbose)
args.provider, args.intra_op_num_threads, args.seed, args.verbose)
print("test setting", test_setting)
run_performance(model_setting, test_setting, perf_results)

View file

@ -6,7 +6,8 @@
# Maps model class name to a tuple of model class
MODEL_CLASSES = [
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering'
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering',
'AutoModelForCausalLM',
]
# List of pretrained models: https://huggingface.co/transformers/pretrained_models.html

View file

@ -255,6 +255,7 @@ def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_
model_class_name = 'TF' + model_class_name
transformers_module = __import__("transformers", fromlist=[model_class_name])
logger.info(f"Model class name: {model_class_name}")
model_class = getattr(transformers_module, model_class_name)
return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)

View file

@ -86,8 +86,11 @@ def parse_arguments(argv=None):
parser.add_argument('-g', '--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument('-d', '--use_dml', required=False, action='store_true', help="use DML")
parser.set_defaults(use_dml=False)
parser.add_argument('--provider',
required=False,
type=str,
default='cuda',
help="Execution provider to use")
parser.add_argument(
'--basic_optimization',
@ -108,15 +111,15 @@ def parse_arguments(argv=None):
return parser.parse_args(argv)
def run_profile(onnx_model_path, use_gpu, basic_optimization, thread_num, all_inputs, use_dml):
def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
from benchmark_helper import create_onnxruntime_session
session = create_onnxruntime_session(onnx_model_path,
use_gpu,
provider,
enable_all_optimization=not basic_optimization,
num_threads=thread_num,
enable_profiling=True,
use_dml=use_dml)
enable_profiling=True)
for inputs in all_inputs:
_ = session.run(None, inputs)
@ -604,7 +607,7 @@ def run(args):
else: # default
all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)
profile_file = run_profile(args.model, args.use_gpu, args.basic_optimization, args.thread_num, all_inputs, args.use_dml)
profile_file = run_profile(args.model, args.use_gpu, args.provider, args.basic_optimization, args.thread_num, all_inputs)
return profile_file