mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-25 22:26:24 +00:00
Update transformers benchmark for transformers 4.3.* and ORT 1.7 (#6796)
* update benchmark for transformers 4.* and ORT 1.7 * Fix gpt2 onnx conversion for transformers 4.3.*. Add a check of transformer version >= 3.1. * remove code related to openmp * update pretrain model list: keep representative models only
This commit is contained in:
parent
71a70ecf6e
commit
f4acdb2ecd
12 changed files with 137 additions and 316 deletions
|
|
@ -210,7 +210,7 @@ For GPU, please append --use_gpu to the command.
|
|||
bert_perf_test.py can be used to check the BERT model inference performance. Below are examples:
|
||||
|
||||
```console
|
||||
python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 --samples 100 --test_times 10 --inclusive
|
||||
python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128
|
||||
```
|
||||
|
||||
For GPU, please append --use_gpu to the command.
|
||||
|
|
@ -219,7 +219,7 @@ After test is finished, a file like perf_results_CPU_B1_S128_<date_time>.txt or
|
|||
|
||||
## Profiling
|
||||
|
||||
profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and time spent on a node or subgraph.
|
||||
profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and CPU time spent on a node or subgraph.
|
||||
|
||||
Example commands:
|
||||
|
||||
|
|
|
|||
|
|
@ -80,9 +80,6 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
|
|||
)
|
||||
return results
|
||||
|
||||
if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
|
||||
logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
|
||||
|
||||
for model_name in model_names:
|
||||
all_input_names = MODELS[model_name][0]
|
||||
for num_inputs in input_counts:
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import argparse
|
|||
import logging
|
||||
import torch
|
||||
import onnx
|
||||
from packaging import version
|
||||
from transformers import AutoConfig
|
||||
from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
|
||||
from quantize_helper import QuantizeHelper
|
||||
|
|
@ -113,6 +114,10 @@ def parse_arguments(argv=None):
|
|||
|
||||
|
||||
def main(args):
|
||||
from transformers import __version__ as transformers_version
|
||||
if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
|
||||
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
|
||||
|
||||
logger.info(f"Arguments:{args}")
|
||||
if args.precision == Precision.FLOAT16:
|
||||
assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
|
||||
|
|
@ -279,7 +284,7 @@ def main(args):
|
|||
return csv_filename
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == '__main__':
|
||||
args = parse_arguments()
|
||||
setup_logger(args.verbose)
|
||||
main(args)
|
||||
|
|
|
|||
|
|
@ -35,44 +35,10 @@ class TestSetting:
|
|||
sequence_length: int
|
||||
test_cases: int
|
||||
test_times: int
|
||||
contiguous: bool
|
||||
use_gpu: bool
|
||||
warmup: bool
|
||||
omp_num_threads: int
|
||||
omp_wait_policy: str
|
||||
intra_op_num_threads: int
|
||||
seed: int
|
||||
verbose: bool
|
||||
contiguous: bool
|
||||
inclusive: bool
|
||||
extra_latency: float = 0
|
||||
|
||||
def get_setting(self) -> str:
    """Describe this test setting as a single comma-separated ``key=value`` string.

    The string covers batch_size, sequence_length, test_cases, test_times,
    contiguous, use_gpu and warmup, in that order.
    """
    summary = f"batch_size={self.batch_size},sequence_length={self.sequence_length},test_cases={self.test_cases},test_times={self.test_times},contiguous={self.contiguous},use_gpu={self.use_gpu},warmup={self.warmup}"
    return summary
|
||||
|
||||
def check(self, intra_op_threads, omp_threads, omp_policy) -> bool:
    """Report whether a candidate thread/policy combination agrees with this setting.

    Args:
        intra_op_threads: candidate intra-op thread count, or None for "not set".
        omp_threads: candidate OpenMP thread count, or None for "not set".
        omp_policy: candidate OpenMP wait policy.

    Returns:
        True when the candidate does not contradict ``intra_op_num_threads``,
        ``omp_num_threads`` and ``omp_wait_policy`` of this setting.

    Raises:
        AssertionError: if a candidate thread count is given but is not positive.
    """

    def count_is_compatible(candidate, configured):
        if candidate is None:
            # An unset candidate only conflicts with a positive configured value.
            return configured is None or not configured > 0
        assert candidate > 0
        return configured is None or configured == candidate

    if not count_is_compatible(intra_op_threads, self.intra_op_num_threads):
        return False

    if not count_is_compatible(omp_threads, self.omp_num_threads):
        return False

    # A configured wait policy has to match exactly; an unset one accepts anything.
    required_policy = self.omp_wait_policy
    return required_policy is None or not omp_policy != required_policy
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelSetting:
|
||||
|
|
@ -84,22 +50,17 @@ class ModelSetting:
|
|||
|
||||
|
||||
def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
|
||||
# Import onnxruntime shall be after OpenMP environment variable setting.
|
||||
# So we put the import in function to delay importing instead of top of this script.
|
||||
import onnxruntime
|
||||
|
||||
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
|
||||
print(
|
||||
"Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
|
||||
)
|
||||
elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
|
||||
print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
|
||||
|
||||
if intra_op_num_threads is None and graph_optimization_level is None:
|
||||
session = onnxruntime.InferenceSession(model_path)
|
||||
else:
|
||||
execution_providers = ['CPUExecutionProvider'
|
||||
] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
||||
|
|
@ -127,8 +88,8 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
|
|||
return session
|
||||
|
||||
|
||||
def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
|
||||
if warmup and len(all_inputs) > 0:
|
||||
def onnxruntime_inference(session, all_inputs, output_names):
|
||||
if len(all_inputs) > 0:
|
||||
# Use a random input as warm up.
|
||||
session.run(output_names, random.choice(all_inputs))
|
||||
|
||||
|
|
@ -142,57 +103,16 @@ def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
|
|||
latency_list.append(latency)
|
||||
return results, latency_list
|
||||
|
||||
|
||||
def get_contiguous_inputs(all_inputs):
    """Convert every input array of every test case to a contiguous array.

    Args:
        all_inputs: sequence of dicts mapping an input name to an array-like value.

    Returns:
        A tuple ``(contiguous_inputs, average_latency_ms)``: the converted test
        cases (same order, same keys) and the average conversion time per test
        case in milliseconds. For an empty ``all_inputs`` the result is
        ``([], 0.0)`` instead of a division by zero.
    """
    start_time = timeit.default_timer()
    contiguous_inputs = [{name: np.ascontiguousarray(value)
                          for name, value in inputs.items()}
                         for inputs in all_inputs]
    latency = timeit.default_timer() - start_time

    # Nothing was converted, so there is no per-case average to compute.
    if not contiguous_inputs:
        return contiguous_inputs, 0.0

    average_latency_ms = latency / len(contiguous_inputs) * 1000
    return contiguous_inputs, average_latency_ms
|
||||
|
||||
|
||||
def to_string(model_path, session, test_setting):
|
||||
sess_options = session.get_session_options()
|
||||
option = "model={}".format(os.path.basename(model_path))
|
||||
option += ",graph_optimization_level={},intra_op_num_threads={}".format(sess_options.graph_optimization_level,
|
||||
option = "model={},".format(os.path.basename(model_path))
|
||||
option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
|
||||
sess_options.intra_op_num_threads).replace(
|
||||
'GraphOptimizationLevel.ORT_', '')
|
||||
option += ",OMP_NUM_THREADS={}".format(os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else "")
|
||||
option += ",OMP_WAIT_POLICY={}".format(os.environ["OMP_WAIT_POLICY"] if "OMP_WAIT_POLICY" in os.environ else "")
|
||||
option += ",{}".format(test_setting.get_setting())
|
||||
option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
|
||||
return option
|
||||
|
||||
|
||||
def setup_openmp_environ(omp_num_threads, omp_wait_policy):
    """Set or clear the OMP_NUM_THREADS and OMP_WAIT_POLICY environment variables.

    Args:
        omp_num_threads: value stored (as a string) in OMP_NUM_THREADS, or None
            to remove the variable.
        omp_wait_policy: "ACTIVE" or "PASSIVE" to store in OMP_WAIT_POLICY, or
            None to remove the variable.

    Raises:
        AssertionError: if ``omp_wait_policy`` is neither None nor one of
            "ACTIVE" / "PASSIVE". The environment is left untouched in that case.
    """
    # Validate before touching os.environ so a rejected call has no side effects.
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if omp_wait_policy is not None and omp_wait_policy not in ["ACTIVE", "PASSIVE"]:
        raise AssertionError(f"{omp_wait_policy} is not a valid policy")

    if omp_num_threads is None:
        os.environ.pop("OMP_NUM_THREADS", None)
    else:
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    if omp_wait_policy is None:
        os.environ.pop("OMP_WAIT_POLICY", None)
    else:
        os.environ["OMP_WAIT_POLICY"] = omp_wait_policy
|
||||
|
||||
|
||||
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
|
||||
omp_wait_policy):
|
||||
# Environment variable shall be set before import onnxruntime.
|
||||
setup_openmp_environ(omp_num_threads, omp_wait_policy)
|
||||
|
||||
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
|
||||
session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
|
||||
model_setting.opt_level)
|
||||
output_names = [output.name for output in session.get_outputs()]
|
||||
|
|
@ -206,11 +126,11 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
|
|||
|
||||
all_latency_list = []
|
||||
for i in range(test_setting.test_times):
|
||||
results, latency_list = onnxruntime_inference(session, all_inputs, output_names, test_setting.warmup)
|
||||
results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
|
||||
all_latency_list.extend(latency_list)
|
||||
|
||||
# latency in miliseconds
|
||||
latency_ms = np.array(all_latency_list) * 1000 + test_setting.extra_latency
|
||||
latency_ms = np.array(all_latency_list) * 1000
|
||||
|
||||
average_latency = statistics.mean(latency_ms)
|
||||
latency_50 = np.percentile(latency_ms, 50)
|
||||
|
|
@ -226,91 +146,31 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
|
|||
format(throughput, '.2f')))
|
||||
|
||||
|
||||
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
|
||||
omp_wait_policy):
|
||||
if not test_setting.check(intra_op_num_threads, omp_num_threads, omp_wait_policy):
|
||||
return
|
||||
|
||||
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
|
||||
process = multiprocessing.Process(target=run_one_test,
|
||||
args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
|
||||
omp_num_threads, omp_wait_policy))
|
||||
args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads))
|
||||
process.start()
|
||||
process.join()
|
||||
|
||||
|
||||
def run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs):
|
||||
def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
|
||||
if (test_setting.intra_op_num_threads is not None):
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
|
||||
return
|
||||
|
||||
cpu_count = psutil.cpu_count(logical=False)
|
||||
logical_cores = psutil.cpu_count(logical=True)
|
||||
|
||||
candidate_threads = list(set([1, logical_cores, cpu_count]))
|
||||
|
||||
if (test_setting.intra_op_num_threads is not None) or (test_setting.omp_num_threads is not None):
|
||||
|
||||
if test_setting.intra_op_num_threads is not None:
|
||||
intra_op_threads = [test_setting.intra_op_num_threads]
|
||||
else:
|
||||
intra_op_threads = [None] + candidate_threads
|
||||
|
||||
if test_setting.omp_num_threads is not None:
|
||||
omp_threads = [test_setting.omp_num_threads]
|
||||
else:
|
||||
omp_threads = [None] + candidate_threads
|
||||
|
||||
if test_setting.omp_wait_policy is not None:
|
||||
omp_policies = [test_setting.omp_wait_policy]
|
||||
else:
|
||||
omp_policies = [None, 'PASSIVE', 'ACTIVE']
|
||||
|
||||
for it in intra_op_threads:
|
||||
for ot in omp_threads:
|
||||
for op in omp_policies:
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, it, ot, op)
|
||||
return
|
||||
|
||||
# Test a setting without any setting as baseline 1.
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, None, None, None)
|
||||
|
||||
if not test_setting.use_gpu:
|
||||
# For CPU: intra_op_num_threads = 1, omp_num_threads=None, omp_wait_policy=None
|
||||
# Another setting without environment variable as baseline 2.
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, 1, None, None)
|
||||
else:
|
||||
# For GPU, we test two more settings by default:
|
||||
# (1) intra_op_num_threads = 1, omp_num_threads=cpu_count, omp_wait_policy=PASSIVE
|
||||
# (2) intra_op_num_threads = logical_cores, omp_num_threads=1, omp_wait_policy=ACTIVE
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, 1, cpu_count, 'PASSIVE')
|
||||
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, logical_cores, 1, 'ACTIVE')
|
||||
|
||||
# GPU latency is not sensitive to these settings. No need to test many combinations.
|
||||
# Skip remaining settings for GPU without --all flag.
|
||||
if test_setting.use_gpu and not test_all:
|
||||
return
|
||||
candidate_threads = list(set([logical_cores, cpu_count]))
|
||||
for i in range(1, min(16, logical_cores)):
|
||||
if i not in candidate_threads:
|
||||
candidate_threads.append(i)
|
||||
candidate_threads.sort(reverse=True)
|
||||
|
||||
for intra_op_num_threads in candidate_threads:
|
||||
for omp_num_threads in candidate_threads:
|
||||
# skip settings that are very slow
|
||||
if intra_op_num_threads == 1 and omp_num_threads == 1 and logical_cores != 1:
|
||||
continue
|
||||
|
||||
# When logical and physical cores are not the same, there are many combinations.
|
||||
# Remove some settings are not good normally.
|
||||
if logical_cores > cpu_count:
|
||||
if omp_num_threads == logical_cores and intra_op_num_threads != 1:
|
||||
continue
|
||||
if intra_op_num_threads == logical_cores and omp_num_threads != 1:
|
||||
continue
|
||||
|
||||
if not test_all:
|
||||
if intra_op_num_threads != 1 and omp_num_threads != 1:
|
||||
continue
|
||||
|
||||
for omp_wait_policy in ['ACTIVE', 'PASSIVE']:
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
|
||||
omp_num_threads, omp_wait_policy)
|
||||
|
||||
|
||||
def run_performance(model_setting, test_setting, perf_results, test_all):
|
||||
launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
|
||||
|
||||
def run_performance(model_setting, test_setting, perf_results):
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
|
||||
model_setting.segment_ids_name, model_setting.input_mask_name)
|
||||
|
||||
|
|
@ -327,29 +187,25 @@ def run_performance(model_setting, test_setting, perf_results, test_all):
|
|||
segment_ids,
|
||||
input_mask,
|
||||
random_mask_length=False)
|
||||
if test_setting.contiguous:
|
||||
all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
|
||||
print("Extra latency for converting inputs to contiguous: {} ms".format(format(contiguous_latency, '.2f')))
|
||||
test_setting.extra_latency = contiguous_latency if test_setting.inclusive else 0
|
||||
|
||||
run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs)
|
||||
run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--model', required=True, type=str, help="bert onnx model path")
|
||||
|
||||
parser.add_argument('--batch_size',
|
||||
parser.add_argument('-b', '--batch_size',
|
||||
required=True,
|
||||
type=int,
|
||||
nargs="+",
|
||||
help="batch size of input. Allow one or multiple values in the range of [1, 128].")
|
||||
|
||||
parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
|
||||
parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")
|
||||
|
||||
parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")
|
||||
|
||||
parser.add_argument('--test_times',
|
||||
parser.add_argument('-t', '--test_times',
|
||||
required=False,
|
||||
type=int,
|
||||
default=0,
|
||||
|
|
@ -375,40 +231,12 @@ def parse_arguments():
|
|||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('--inclusive',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="include the latency of converting array to contiguous")
|
||||
parser.set_defaults(inclusive=False)
|
||||
|
||||
parser.add_argument('--all', required=False, action='store_true', help="test all candidate settings")
|
||||
parser.set_defaults(all=False)
|
||||
|
||||
parser.add_argument('--omp_num_threads',
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help=">0, set OMP_NUM_THREADS value. 0, do not set")
|
||||
|
||||
parser.add_argument('--intra_op_num_threads',
|
||||
parser.add_argument('-n', '--intra_op_num_threads',
|
||||
required=False,
|
||||
type=int,
|
||||
default=None,
|
||||
help=">=0, set intra_op_num_threads")
|
||||
|
||||
parser.add_argument('--omp_wait_policy',
|
||||
required=False,
|
||||
type=str,
|
||||
default=None,
|
||||
choices=['ACTIVE', 'PASSIVE'],
|
||||
help="OMP_WAIT_POLICY")
|
||||
|
||||
parser.add_argument('--contiguous', required=False, action='store_true', help="contiguous input")
|
||||
parser.set_defaults(contiguous=False)
|
||||
|
||||
parser.add_argument('--no_warmup', required=False, action='store_true', help="do not use one sample for warm-up.")
|
||||
parser.set_defaults(no_warmup=False)
|
||||
|
||||
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
|
||||
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
|
||||
parser.add_argument('--input_mask_name',
|
||||
|
|
@ -443,18 +271,13 @@ def main():
|
|||
args.sequence_length,
|
||||
args.samples,
|
||||
args.test_times,
|
||||
None, #contiguous
|
||||
args.use_gpu,
|
||||
not args.no_warmup,
|
||||
args.omp_num_threads,
|
||||
args.omp_wait_policy,
|
||||
args.intra_op_num_threads,
|
||||
args.seed,
|
||||
args.verbose,
|
||||
args.contiguous,
|
||||
args.inclusive)
|
||||
args.verbose)
|
||||
|
||||
print("test setting", test_setting)
|
||||
run_performance(model_setting, test_setting, perf_results, args.all)
|
||||
run_performance(model_setting, test_setting, perf_results)
|
||||
|
||||
# Sort the results so that the first one has smallest latency.
|
||||
sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1])
|
||||
|
|
|
|||
|
|
@ -140,7 +140,8 @@ def generate_test_data(batch_size, sequence_length, test_cases, seed, verbose, i
|
|||
|
||||
|
||||
def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
|
||||
assert input_index < len(embed_node.input)
|
||||
if input_index >= len(embed_node.input):
|
||||
return None
|
||||
|
||||
input = embed_node.input[input_index]
|
||||
graph_input = onnx_model.find_graph_input(input)
|
||||
|
|
@ -195,6 +196,15 @@ def find_bert_inputs(onnx_model, input_ids_name=None, segment_ids_name=None, inp
|
|||
input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
|
||||
segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1)
|
||||
input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7)
|
||||
|
||||
if input_mask is None:
|
||||
for input in graph_inputs:
|
||||
input_name_lower = input.name.lower()
|
||||
if "mask" in input_name_lower:
|
||||
input_mask = input
|
||||
if input_mask is None:
|
||||
raise ValueError(f"Failed to find attention mask input")
|
||||
|
||||
return input_ids, segment_ids, input_mask
|
||||
|
||||
# Try guess the inputs based on naming.
|
||||
|
|
@ -231,7 +241,7 @@ def get_bert_inputs(onnx_file, input_ids_name=None, segment_ids_name=None, input
|
|||
model.ParseFromString(f.read())
|
||||
|
||||
onnx_model = OnnxModel(model)
|
||||
find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
|
||||
return find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
|
|
|
|||
|
|
@ -21,19 +21,17 @@ from datetime import datetime
|
|||
from onnx import ModelProto, TensorProto, numpy_helper
|
||||
from onnx_model import OnnxModel
|
||||
from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
|
||||
from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ
|
||||
from bert_perf_test import create_session, onnxruntime_inference
|
||||
|
||||
|
||||
def run_model(model_path, all_inputs, use_gpu, use_openmp, disable_optimization):
|
||||
# Import onnxruntime shall be after OpenMP environment variable setting.
|
||||
# So we put import here to delay importing.
|
||||
def run_model(model_path, all_inputs, use_gpu, disable_optimization):
|
||||
import onnxruntime
|
||||
|
||||
graph_optimization_level = None
|
||||
if disable_optimization:
|
||||
graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
|
||||
|
||||
intra_op_num_threads = 1 if use_openmp else psutil.cpu_count(logical=False)
|
||||
intra_op_num_threads = psutil.cpu_count(logical=False)
|
||||
|
||||
session = create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level)
|
||||
|
||||
|
|
@ -78,7 +76,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):
|
|||
|
||||
|
||||
def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
|
||||
use_openmp, verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
|
||||
verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
|
||||
|
||||
# Try deduce input names from optimized model.
|
||||
input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
|
||||
|
|
@ -95,16 +93,9 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
|
|||
input_mask,
|
||||
random_mask_length=True)
|
||||
|
||||
# OpenMP environment variables must be set before the very first "import onnxruntime"
|
||||
if use_openmp:
|
||||
setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False), omp_wait_policy='ACTIVE')
|
||||
else:
|
||||
setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE')
|
||||
|
||||
baseline_results, baseline_latency, output_names = run_model(baseline_model,
|
||||
all_inputs,
|
||||
use_gpu,
|
||||
use_openmp,
|
||||
disable_optimization=True)
|
||||
if verbose:
|
||||
print("baseline average latency (all optimizations disabled): {} ms".format(
|
||||
|
|
@ -117,7 +108,6 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
|
|||
treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
|
||||
all_inputs,
|
||||
use_gpu,
|
||||
use_openmp,
|
||||
disable_optimization=False)
|
||||
if verbose:
|
||||
print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
|
||||
|
|
@ -157,9 +147,6 @@ def parse_arguments():
|
|||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('--openmp', required=False, action='store_true', help="use openmp")
|
||||
parser.set_defaults(openmp=False)
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
|
|
@ -180,7 +167,7 @@ def main():
|
|||
path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
|
||||
args.use_gpu, args.samples, args.seed, args.openmp, args.verbose, args.rtol, args.atol, args.input_ids,
|
||||
args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
|
||||
args.segment_ids, args.input_mask)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ import torch
|
|||
import numpy
|
||||
import json
|
||||
from pathlib import Path
|
||||
from packaging import version
|
||||
from transformers import AutoConfig
|
||||
from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
|
||||
from gpt2_tester import Gpt2Tester
|
||||
|
|
@ -104,6 +105,10 @@ def parse_arguments():
|
|||
|
||||
|
||||
def main():
|
||||
from transformers import __version__ as transformers_version
|
||||
if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
|
||||
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
|
||||
|
||||
args = parse_arguments()
|
||||
setup_logger(args.verbose)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,10 +7,9 @@ REM Please install PyTorch (see https://pytorch.org/) before running this benchm
|
|||
REM GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
|
||||
REM CPU: conda install pytorch torchvision cpuonly -c pytorch
|
||||
|
||||
REM When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
|
||||
REM it will use onnxruntime-tools package.
|
||||
REM If run_cli=false, it depends on other python script (*.py) files in this directory.
|
||||
set run_cli=false
|
||||
REM When use_package=true, you need not copy other files to run benchmarks except this batch file.
|
||||
REM Otherwise, it will use python script (*.py) files in this directory.
|
||||
set use_package=false
|
||||
|
||||
REM only need once
|
||||
set run_install=false
|
||||
|
|
@ -72,13 +71,12 @@ if %run_install% == true (
|
|||
)
|
||||
|
||||
pip install --upgrade onnxconverter_common
|
||||
pip install --upgrade onnxruntime-tools
|
||||
pip install --upgrade git+https://github.com/huggingface/transformers
|
||||
pip install --upgrade transformers
|
||||
)
|
||||
|
||||
if %run_cli% == true (
|
||||
echo Use onnxruntime_tools.transformers.benchmark
|
||||
set optimizer_script=-m onnxruntime_tools.transformers.benchmark
|
||||
if %use_package% == true (
|
||||
echo Use onnxruntime.transformers.benchmark
|
||||
set optimizer_script=-m onnxruntime.transformers.benchmark
|
||||
) else (
|
||||
set optimizer_script=benchmark.py
|
||||
)
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ class GPT2ModelNoPastState(GPT2Model):
|
|||
super().__init__(config)
|
||||
|
||||
def forward(self, input_ids):
|
||||
return super().forward(input_ids, use_cache=False)
|
||||
return super().forward(input_ids, use_cache=False, return_dict=False)
|
||||
|
||||
|
||||
class MyGPT2Model(GPT2Model):
|
||||
|
|
@ -40,11 +40,26 @@ class MyGPT2Model(GPT2Model):
|
|||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
|
||||
@staticmethod
def post_process(result, num_layer):
    """Merge per-layer (key, value) pairs of a model result into one tensor per layer.

    Args:
        result: indexable model output; ``result[0]`` is passed through unchanged
            and ``result[1]`` is the per-layer past/present state.
        num_layer: expected number of entries in ``result[1]``.

    Returns:
        ``(result[0], present)`` where ``present`` is a tuple with one tensor per
        layer, key and value stacked along a new leading dimension, when the
        first entry of ``result[1]`` is a tuple or list. Otherwise ``result`` is
        returned unchanged.
    """
    if not isinstance(result[1][0], (tuple, list)):
        return result

    assert len(result[1]) == num_layer and len(result[1][0]) == 2, \
        "expected one (key, value) pair per layer"

    # Since transformers v4.*, past key and values are separate outputs.
    # Concatenate them into one tensor to be compatible with the Attention operator.
    present = tuple(torch.stack((result[1][i][0], result[1][i][1]), dim=0) for i in range(num_layer))
    return (result[0], present)
|
||||
|
||||
def forward(self, input_ids, position_ids, attention_mask, *past):
|
||||
return super().forward(input_ids,
|
||||
position_ids=position_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past)
|
||||
result = super().forward(input_ids,
|
||||
position_ids=position_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past,
|
||||
return_dict=False)
|
||||
return MyGPT2Model.post_process(result, self.config.n_layer)
|
||||
|
||||
|
||||
class MyGPT2LMHeadModel(GPT2LMHeadModel):
|
||||
|
|
@ -54,10 +69,13 @@ class MyGPT2LMHeadModel(GPT2LMHeadModel):
|
|||
super().__init__(config)
|
||||
|
||||
def forward(self, input_ids, position_ids, attention_mask, *past):
|
||||
return super().forward(input_ids,
|
||||
position_ids=position_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past)
|
||||
result = super().forward(input_ids,
|
||||
position_ids=position_ids,
|
||||
attention_mask=attention_mask,
|
||||
past_key_values=past,
|
||||
return_dict=False)
|
||||
|
||||
return MyGPT2Model.post_process(result, self.config.n_layer)
|
||||
|
||||
|
||||
class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
|
||||
|
|
@ -216,6 +234,7 @@ class Gpt2Helper:
|
|||
|
||||
is_all_close = is_close
|
||||
num_layers = len(ort_outputs) - 1
|
||||
|
||||
for layer in range(num_layers):
|
||||
is_close = numpy.allclose(ort_outputs[1 + layer],
|
||||
torch_outputs[1][layer].cpu().numpy(),
|
||||
|
|
@ -288,10 +307,12 @@ class Gpt2Helper:
|
|||
input_names.append('attention_mask')
|
||||
input_names.extend(past_names)
|
||||
|
||||
assert len(outputs) == 2 and len(outputs[1]) == num_layer
|
||||
|
||||
logger.info(
|
||||
f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}"
|
||||
)
|
||||
|
||||
|
||||
Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
torch.onnx.export(model,
|
||||
|
|
|
|||
|
|
@ -26,67 +26,44 @@ MODELS = {
|
|||
"bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
|
||||
"token_type_ids"], 11, False, "bert"),
|
||||
"bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
|
||||
"token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
"bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
|
||||
# todo: more models to add
|
||||
# GPT
|
||||
"openai-gpt": (["input_ids"], 11, False, "gpt2"), # no past state inputs
|
||||
# GPT-2
|
||||
"gpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs
|
||||
|
||||
# GPT (no past state)
|
||||
"openai-gpt": (["input_ids"], 11, False, "gpt2"),
|
||||
# GPT-2 (no past state, use benchmark_gpt2.py for past_key_values)
|
||||
"gpt2": (["input_ids"], 11, False, "gpt2"),
|
||||
"gpt2-medium": (["input_ids"], 11, False, "gpt2"),
|
||||
"gpt2-large":
|
||||
(["input_ids"], 11, True,
|
||||
"gpt2"), # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models.
|
||||
"gpt2-large": (["input_ids"], 11, True, "gpt2"),
|
||||
"gpt2-xl": (["input_ids"], 11, True, "gpt2"),
|
||||
"distilgpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs
|
||||
"distilgpt2": (["input_ids"], 11, False, "gpt2"),
|
||||
# Transformer-XL
|
||||
#"transfo-xl-wt103": (["input_ids"], 11, False, "bert"),
|
||||
# XLNet
|
||||
#"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
|
||||
#"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
|
||||
"xlnet-base-cased": (["input_ids"], 12, False, "bert"),
|
||||
"xlnet-large-cased": (["input_ids"], 12, False, "bert"),
|
||||
# XLM
|
||||
"xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"),
|
||||
"xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-mlm-enro-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-mlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-mlm-tlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-clm-enfr-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"),
|
||||
"xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"),
|
||||
# XLM-RoBERTa
|
||||
"xlm-roberta-base": (["input_ids"], 12, False, "bert"),
|
||||
# RoBERTa
|
||||
"roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
|
||||
# DistilBERT
|
||||
"distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
"distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
|
||||
# CTRL
|
||||
"ctrl": (["input_ids"], 11, True, "bert"),
|
||||
# CamemBERT
|
||||
"camembert-base": (["input_ids"], 11, False, "bert"),
|
||||
# ALBERT
|
||||
# These models use Einsum, which needs opset version 12 and PyTorch 1.5.0 or above.
|
||||
"albert-base-v1": (["input_ids"], 12, False, "bert"),
|
||||
"albert-large-v1": (["input_ids"], 12, False, "bert"),
|
||||
"albert-xlarge-v1": (["input_ids"], 12, True, "bert"),
|
||||
|
|
@ -95,36 +72,37 @@ MODELS = {
|
|||
"albert-large-v2": (["input_ids"], 12, False, "bert"),
|
||||
"albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
|
||||
#"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
|
||||
# T5
|
||||
"t5-small": (["input_ids"], 12, False, "bert"),
|
||||
"t5-base": (["input_ids"], 12, False, "bert"),
|
||||
"t5-large": (["input_ids"], 12, True, "bert"),
|
||||
"t5-3b": (["input_ids"], 12, True, "bert"),
|
||||
"t5-11b": (["input_ids"], 12, True, "bert"),
|
||||
# T5 (use benchmark_t5.py instead)
|
||||
#"t5-small": (["input_ids"], 12, False, "bert"),
|
||||
#"t5-base": (["input_ids"], 12, False, "bert"),
|
||||
#"t5-large": (["input_ids"], 12, True, "bert"),
|
||||
#"t5-3b": (["input_ids"], 12, True, "bert"),
|
||||
#"t5-11b": (["input_ids"], 12, True, "bert"),
|
||||
#"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"),
|
||||
# XLM-RoBERTa
|
||||
"xlm-roberta-base": (["input_ids"], 11, False, "bert"),
|
||||
"xlm-roberta-large": (["input_ids"], 11, True, "bert"),
|
||||
# FlauBERT
|
||||
"flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"),
|
||||
"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
|
||||
#"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
|
||||
"flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"),
|
||||
"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
|
||||
#"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
|
||||
# Bart
|
||||
"facebook/bart-large": (["input_ids"], 11, False, "bert"),
|
||||
"facebook/bart-base": (["input_ids"], 11, False, "bert"),
|
||||
"facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"),
|
||||
"facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"),
|
||||
#"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"),
|
||||
|
||||
# DialoGPT
|
||||
"microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"),
|
||||
"microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"),
|
||||
"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
|
||||
#"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
|
||||
# Reformer
|
||||
#"google/reformer-enwik8": (["input_ids"], 11, False, "bert"),
|
||||
#"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"),
|
||||
# MarianMT
|
||||
#"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"),
|
||||
# Longformer
|
||||
# Longformer (use benchmark_longformer.py instead)
|
||||
#"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"),
|
||||
#"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,13 +5,12 @@
|
|||
# --------------------------------------------------------------------------
|
||||
# This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models.
|
||||
# Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following:
|
||||
# GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
|
||||
# GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch
|
||||
# CPU: conda install pytorch torchvision cpuonly -c pytorch
|
||||
|
||||
# When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
|
||||
# it will use onnxruntime-tools package.
|
||||
# If run_cli=false, it depends on other python script (*.py) files in this directory.
|
||||
run_cli=true
|
||||
# When use_package=true, you need not copy other files to run benchmarks except this sh file.
|
||||
# Otherwise, it will use python script (*.py) files in this directory.
|
||||
use_package=true
|
||||
|
||||
# Installation only needs to run once; set run_install=false afterwards.
|
||||
run_install=true
|
||||
|
|
@ -50,7 +49,7 @@ sequence_lengths="8 16 32 64 128 256 512 1024"
|
|||
input_counts=1
|
||||
|
||||
# Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased
|
||||
models_to_test="bert-base-cased roberta-base gpt2"
|
||||
models_to_test="bert-base-cased roberta-base distilbert-base-uncased"
|
||||
|
||||
# If you have multiple GPUs, you can choose one GPU for the test. Here is an example that uses the second GPU:
|
||||
# export CUDA_VISIBLE_DEVICES=1
|
||||
|
|
@ -81,7 +80,7 @@ fi
|
|||
|
||||
|
||||
if [ "$run_install" = true ] ; then
|
||||
pip uninstall --yes ort_nightly
|
||||
pip uninstall --yes ort-nightly ort-gpu-nightly
|
||||
pip uninstall --yes onnxruntime
|
||||
pip uninstall --yes onnxruntime-gpu
|
||||
if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
|
||||
|
|
@ -89,14 +88,12 @@ if [ "$run_install" = true ] ; then
|
|||
else
|
||||
pip install onnxruntime-gpu
|
||||
fi
|
||||
pip install --upgrade onnxconverter_common
|
||||
pip install --upgrade onnxruntime-tools
|
||||
pip install --upgrade transformers
|
||||
pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers
|
||||
fi
|
||||
|
||||
if [ "$run_cli" = true ] ; then
|
||||
echo "Use onnxruntime_tools.transformers.benchmark"
|
||||
benchmark_script="-m onnxruntime_tools.transformers.benchmark"
|
||||
if [ "$use_package" = true ] ; then
|
||||
echo "Use onnxruntime.transformers.benchmark"
|
||||
benchmark_script="-m onnxruntime.transformers.benchmark"
|
||||
else
|
||||
benchmark_script="benchmark.py"
|
||||
fi
|
||||
|
|
@ -187,4 +184,4 @@ fi
|
|||
# Remove duplicated lines
|
||||
awk '!x[$0]++' ./result.csv > summary_result.csv
|
||||
awk '!x[$0]++' ./fusion.csv > summary_fusion.csv
|
||||
awk '!x[$0]++' ./detail.csv > summary_detail.csv
|
||||
awk '!x[$0]++' ./detail.csv > summary_detail.csv
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ class TestGpt2(unittest.TestCase):
|
|||
|
||||
def test_gpt2_fp16(self):
|
||||
if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
|
||||
self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128')
|
||||
self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128 --use_gpu')
|
||||
|
||||
def test_gpt2_int8(self):
|
||||
self.run_benchmark_gpt2('-m gpt2 --precision int8 -o -b 1 -s 128')
|
||||
|
|
|
|||
Loading…
Reference in a new issue