mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-24 22:17:32 +00:00
Support specifying an execution provider in benchmark script (#10453)
* Support specifying execution providers.
* Change default provider setting to None.
* Add support for bert_perf_test script.
* Fall back to ROCM/CUDA EP for MIGraphX/TensorRT EP.
* Assert fallback EPs are included.
* Add model class AutoModelForCausalLM and other minor updates.

Co-authored-by: Yao Zhang <zhanyao@microsoft.com>
This commit is contained in:
parent
a405658370
commit
239c6ad3f0
6 changed files with 79 additions and 22 deletions
|
|
@ -68,13 +68,14 @@ import torch
|
|||
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model, LxmertConfig)
|
||||
|
||||
|
||||
def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
|
||||
def run_onnxruntime(use_gpu, provider, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
|
||||
repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
|
||||
disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
|
||||
import onnxruntime
|
||||
|
||||
results = []
|
||||
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
|
||||
if (use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()) and
|
||||
('ROCMExecutionProvider' not in onnxruntime.get_available_providers())):
|
||||
logger.error(
|
||||
"Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
|
||||
)
|
||||
|
|
@ -105,6 +106,7 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
|
|||
|
||||
ort_session = create_onnxruntime_session(onnx_model_file,
|
||||
use_gpu,
|
||||
provider,
|
||||
enable_all_optimization=True,
|
||||
num_threads=num_threads,
|
||||
verbose=verbose)
|
||||
|
|
@ -425,7 +427,13 @@ def parse_arguments():
|
|||
default=os.path.join('.', 'onnx_models'),
|
||||
help="Directory to store onnx models")
|
||||
|
||||
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on cuda device")
|
||||
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")
|
||||
|
||||
parser.add_argument("--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="Execution provider to use")
|
||||
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
|
|
@ -545,7 +553,7 @@ def main():
|
|||
if enable_onnxruntime:
|
||||
try:
|
||||
use_raw_attention_mask = True
|
||||
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
|
||||
results += run_onnxruntime(args.use_gpu, args.provider, args.models, args.model_class, args.precision, num_threads,
|
||||
args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts,
|
||||
args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir,
|
||||
args.verbose, args.overwrite, args.disable_ort_io_binding,
|
||||
|
|
|
|||
|
|
@ -39,11 +39,11 @@ IO_BINDING_DATA_TYPE_MAP = {
|
|||
|
||||
def create_onnxruntime_session(onnx_model_path,
|
||||
use_gpu,
|
||||
provider=None,
|
||||
enable_all_optimization=True,
|
||||
num_threads=-1,
|
||||
enable_profiling=False,
|
||||
verbose=False,
|
||||
use_dml=False):
|
||||
verbose=False):
|
||||
session = None
|
||||
try:
|
||||
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
|
||||
|
|
@ -68,8 +68,16 @@ def create_onnxruntime_session(onnx_model_path,
|
|||
|
||||
logger.debug(f"Create session for onnx model: {onnx_model_path}")
|
||||
if use_gpu:
|
||||
if use_dml:
|
||||
if provider == 'dml':
|
||||
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'rocm':
|
||||
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'migraphx':
|
||||
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'cuda':
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'tensorrt':
|
||||
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
else:
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
else:
|
||||
|
|
@ -89,7 +97,7 @@ def setup_logger(verbose=True):
|
|||
logging.getLogger("transformers").setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):
|
||||
def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
|
||||
if cache_dir and not os.path.exists(cache_dir):
|
||||
os.makedirs(cache_dir)
|
||||
|
||||
|
|
@ -98,7 +106,7 @@ def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):
|
|||
|
||||
import onnxruntime
|
||||
if use_gpu:
|
||||
if use_dml:
|
||||
if provider == 'dml':
|
||||
assert 'DmlExecutionProvider' in onnxruntime.get_available_providers(
|
||||
), "Please install onnxruntime-directml package to test GPU inference."
|
||||
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ class TestSetting:
|
|||
test_cases: int
|
||||
test_times: int
|
||||
use_gpu: bool
|
||||
provider: str
|
||||
intra_op_num_threads: int
|
||||
seed: int
|
||||
verbose: bool
|
||||
|
|
@ -50,7 +51,7 @@ class ModelSetting:
|
|||
opt_level: int
|
||||
|
||||
|
||||
def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
|
||||
def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None):
|
||||
import onnxruntime
|
||||
|
||||
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
|
||||
|
|
@ -61,8 +62,21 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
|
|||
if intra_op_num_threads is None and graph_optimization_level is None:
|
||||
session = onnxruntime.InferenceSession(model_path)
|
||||
else:
|
||||
execution_providers = ['CPUExecutionProvider'
|
||||
] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
if use_gpu:
|
||||
if provider == 'dml':
|
||||
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'rocm':
|
||||
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'migraphx':
|
||||
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'cuda':
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
elif provider == 'tensorrt':
|
||||
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
else:
|
||||
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
else:
|
||||
execution_providers = ['CPUExecutionProvider']
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
||||
|
|
@ -86,7 +100,23 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
|
|||
session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)
|
||||
|
||||
if use_gpu:
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
if provider == 'dml':
|
||||
assert 'DmlExecutionProvider' in session.get_providers()
|
||||
elif provider == 'rocm':
|
||||
assert 'ROCMExecutionProvider' in session.get_providers()
|
||||
elif provider == 'migraphx':
|
||||
assert 'MIGraphXExecutionProvider' in session.get_providers()
|
||||
assert 'ROCMExecutionProvider' in session.get_providers()
|
||||
elif provider == 'cuda':
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
elif provider == 'tensorrt':
|
||||
assert 'TensorrtExecutionProvider' in session.get_providers()
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
else:
|
||||
assert 'CUDAExecutionProvider' in session.get_providers()
|
||||
else:
|
||||
assert 'CPUExecutionProvider' in session.get_providers()
|
||||
|
||||
return session
|
||||
|
||||
|
||||
|
|
@ -117,7 +147,7 @@ def to_string(model_path, session, test_setting):
|
|||
|
||||
|
||||
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
|
||||
session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
|
||||
session = create_session(model_setting.model_path, test_setting.use_gpu, test_setting.provider, intra_op_num_threads,
|
||||
model_setting.opt_level)
|
||||
output_names = [output.name for output in session.get_outputs()]
|
||||
|
||||
|
|
@ -239,6 +269,12 @@ def parse_arguments():
|
|||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument("--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
help="Execution provider to use")
|
||||
|
||||
parser.add_argument('-n',
|
||||
'--intra_op_num_threads',
|
||||
required=False,
|
||||
|
|
@ -276,7 +312,7 @@ def main():
|
|||
|
||||
for batch_size in batch_size_set:
|
||||
test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu,
|
||||
args.intra_op_num_threads, args.seed, args.verbose)
|
||||
args.provider, args.intra_op_num_threads, args.seed, args.verbose)
|
||||
|
||||
print("test setting", test_setting)
|
||||
run_performance(model_setting, test_setting, perf_results)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@
|
|||
|
||||
# Maps model class name to a tuple of model class
|
||||
MODEL_CLASSES = [
|
||||
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering'
|
||||
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering',
|
||||
'AutoModelForCausalLM',
|
||||
]
|
||||
|
||||
# List of pretrained models: https://huggingface.co/transformers/pretrained_models.html
|
||||
|
|
|
|||
|
|
@ -255,6 +255,7 @@ def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_
|
|||
model_class_name = 'TF' + model_class_name
|
||||
|
||||
transformers_module = __import__("transformers", fromlist=[model_class_name])
|
||||
logger.info(f"Model class name: {model_class_name}")
|
||||
model_class = getattr(transformers_module, model_class_name)
|
||||
|
||||
return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)
|
||||
|
|
|
|||
|
|
@ -86,8 +86,11 @@ def parse_arguments(argv=None):
|
|||
parser.add_argument('-g', '--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('-d', '--use_dml', required=False, action='store_true', help="use DML")
|
||||
parser.set_defaults(use_dml=False)
|
||||
parser.add_argument('--provider',
|
||||
required=False,
|
||||
type=str,
|
||||
default='cuda',
|
||||
help="Execution provider to use")
|
||||
|
||||
parser.add_argument(
|
||||
'--basic_optimization',
|
||||
|
|
@ -108,15 +111,15 @@ def parse_arguments(argv=None):
|
|||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def run_profile(onnx_model_path, use_gpu, basic_optimization, thread_num, all_inputs, use_dml):
|
||||
def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
|
||||
from benchmark_helper import create_onnxruntime_session
|
||||
|
||||
session = create_onnxruntime_session(onnx_model_path,
|
||||
use_gpu,
|
||||
provider,
|
||||
enable_all_optimization=not basic_optimization,
|
||||
num_threads=thread_num,
|
||||
enable_profiling=True,
|
||||
use_dml=use_dml)
|
||||
enable_profiling=True)
|
||||
|
||||
for inputs in all_inputs:
|
||||
_ = session.run(None, inputs)
|
||||
|
|
@ -604,7 +607,7 @@ def run(args):
|
|||
else: # default
|
||||
all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)
|
||||
|
||||
profile_file = run_profile(args.model, args.use_gpu, args.basic_optimization, args.thread_num, all_inputs, args.use_dml)
|
||||
profile_file = run_profile(args.model, args.use_gpu, args.provider, args.basic_optimization, args.thread_num, all_inputs)
|
||||
|
||||
return profile_file
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue