mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-02 03:55:34 +00:00
update benchmark_longformer for default test suite (#6772)
This commit is contained in:
parent
8a6f6bc38b
commit
8703e2c778
3 changed files with 193 additions and 151 deletions
|
|
@ -297,10 +297,8 @@ def measure_memory(is_gpu, func):
|
|||
return None
|
||||
|
||||
monitor = MemoryMonitor(False)
|
||||
if is_gpu:
|
||||
print(f"GPU memory usage before testing: {monitor.measure_gpu_usage()}")
|
||||
else:
|
||||
print(f"Peak CPU memory usage before testing: {monitor.measure_cpu_usage():.2f} MB")
|
||||
|
||||
memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage()
|
||||
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
with ThreadPoolExecutor() as executor:
|
||||
|
|
@ -312,8 +310,15 @@ def measure_memory(is_gpu, func):
|
|||
finally:
|
||||
monitor.keep_measuring = False
|
||||
max_usage = mem_thread.result()
|
||||
|
||||
if is_gpu:
|
||||
print(f"Peak GPU memory usage: {max_usage}")
|
||||
print(f"GPU memory usage: before={memory_before_test} peak={max_usage}")
|
||||
if len(memory_before_test) >= 1 and len(max_usage) >= 1:
|
||||
before = memory_before_test[0]["max_used_MB"]
|
||||
after = max_usage[0]["max_used_MB"]
|
||||
return after - before
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
print(f"Peak CPU memory usage: {max_usage:.2f} MB")
|
||||
return max_usage
|
||||
print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB")
|
||||
return max_usage - memory_before_test
|
||||
|
|
|
|||
|
|
@ -13,17 +13,20 @@
|
|||
# cd ../torch_extensions
|
||||
# python setup.py install
|
||||
# cd ../longformer
|
||||
# python convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp32 --optimize_onnx
|
||||
# python convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp16 --optimize_onnx
|
||||
#
|
||||
# When no parameters are given, all available tests (memory & latency) will run on the longformer-base-4096 pretrained model.
|
||||
# python benchmark_longformer.py
|
||||
#
|
||||
# Benchmark the latency (Exported onnx model is in the current directory):
|
||||
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx_dir . --validate_onnx -t 100
|
||||
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx ./longformer-base-4096_fp16.onnx --validate_onnx -t 100
|
||||
#
|
||||
# Benchmark GPU peak memory:
|
||||
# export ORT_LONGFORMER_COMPACT_MEMORY=0
|
||||
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
|
||||
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
|
||||
# export ORT_LONGFORMER_COMPACT_MEMORY=1
|
||||
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
|
||||
# By default, the compact memory kernel is not enabled since it is slower. You need to set the environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it, which uses less memory in this test.
|
||||
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
|
||||
# By default, the compact memory kernel is not enabled. You need to set the environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it.
|
||||
|
||||
import timeit
|
||||
from datetime import datetime
|
||||
|
|
@ -34,7 +37,6 @@ import sys
|
|||
import torch
|
||||
import onnxruntime
|
||||
import numpy as np
|
||||
import pprint
|
||||
import math
|
||||
|
||||
from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS
|
||||
|
|
@ -67,12 +69,14 @@ def test_torch_latency(device, model, model_name, batch_sizes, sequence_lengths,
|
|||
"precision": "fp32",
|
||||
"io_binding": "",
|
||||
"model_name": model_name,
|
||||
"description": model_name + "[torch]",
|
||||
"inputs": 3,
|
||||
"threads": num_threads,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"global_length": global_length,
|
||||
"datetime": str(datetime.now()),
|
||||
"memory": "?",
|
||||
}
|
||||
result.update(benchmark_helper.get_latency_result(runtimes, batch_size))
|
||||
|
||||
|
|
@ -96,11 +100,13 @@ def test_parity(device, model, ort_session, batch_size, sequence_length, global_
|
|||
if verbose and (math.isnan(max_diff) or max_diff > 0.001):
|
||||
print("torch last_state:", torch_outputs[0])
|
||||
print("ort last_state:", ort_outputs[0])
|
||||
return max_diff
|
||||
|
||||
|
||||
def test_ort_latency(device,
|
||||
model,
|
||||
model_name,
|
||||
description,
|
||||
ort_session,
|
||||
batch_sizes,
|
||||
sequence_lengths,
|
||||
|
|
@ -135,6 +141,7 @@ def test_ort_latency(device,
|
|||
|
||||
result_template = {
|
||||
"model_name": model_name,
|
||||
"description": description,
|
||||
"inputs": 3,
|
||||
"engine": "OnnxRuntime",
|
||||
"version": onnxruntime.__version__,
|
||||
|
|
@ -147,6 +154,7 @@ def test_ort_latency(device,
|
|||
"global_length": global_length,
|
||||
"test_times": test_times,
|
||||
"datetime": str(datetime.now()),
|
||||
"memory": "",
|
||||
}
|
||||
|
||||
if not disable_io_binding:
|
||||
|
|
@ -172,12 +180,12 @@ def test_ort_latency(device,
|
|||
repeat_times=test_times,
|
||||
batch_size=batch_size)
|
||||
|
||||
pprint.pprint(result)
|
||||
results.append(result)
|
||||
|
||||
if validate_onnx:
|
||||
test_parity(device, model, ort_session, batch_size, sequence_length, global_length, verbose)
|
||||
max_diff = test_parity(device, model, ort_session, batch_size, sequence_length, global_length,
|
||||
verbose)
|
||||
result["description"] += f"(max_diff={max_diff})"
|
||||
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
|
||||
|
|
@ -198,83 +206,120 @@ def test_ort_memory(device, onnx_model_path, batch_size, sequence_length, global
|
|||
for _ in range(test_times):
|
||||
ort_outputs = session.run(None, ort_inputs)
|
||||
|
||||
benchmark_helper.measure_memory(is_gpu=True, func=inference)
|
||||
print("Memory test is done")
|
||||
memory_used = benchmark_helper.measure_memory(is_gpu=True, func=inference)
|
||||
|
||||
return {
|
||||
"onnx_model": onnx_model_path,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"global_length": global_length,
|
||||
"test_times": test_times,
|
||||
"num_threads": num_threads,
|
||||
"memory": memory_used
|
||||
}
|
||||
|
||||
|
||||
def test_all(args):
|
||||
# Currently, the longformer attention operator can only run on GPU (there is no CPU implementation yet).
|
||||
device = torch.device('cuda:0')
|
||||
def load_torch_model(model_name, device):
|
||||
torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[
|
||||
model_name] if model_name in PRETRAINED_LONGFORMER_MODELS else model_name
|
||||
|
||||
results = []
|
||||
for model_name in args.models:
|
||||
# Here we run an example input
|
||||
from transformers import LongformerModel
|
||||
torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
|
||||
model = LongformerModel.from_pretrained(torch_model_name_or_dir) # pretrained model name or directory
|
||||
model.to(device)
|
||||
from transformers import LongformerModel
|
||||
model = LongformerModel.from_pretrained(torch_model_name_or_dir)
|
||||
model.to(device)
|
||||
return model
|
||||
|
||||
# Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
|
||||
# TODO: call convert_longformer_to_onnx to export onnx instead.
|
||||
import os.path
|
||||
optimized = False
|
||||
precision = 'fp32'
|
||||
onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
|
||||
optimized_fp32_model = os.path.join(args.onnx_dir, model_name + "_fp32.onnx")
|
||||
optimized_fp16_model = os.path.join(args.onnx_dir, model_name + "_fp16.onnx")
|
||||
if os.path.isfile(optimized_fp16_model):
|
||||
onnx_model_path = optimized_fp16_model
|
||||
optimized = True
|
||||
precision = 'fp16'
|
||||
elif os.path.isfile(optimized_fp32_model):
|
||||
onnx_model_path = optimized_fp32_model
|
||||
optimized = True
|
||||
print("ONNX model path:", onnx_model_path)
|
||||
|
||||
for num_threads in args.num_threads:
|
||||
if "torch" in args.engines:
|
||||
results += test_torch_latency(device, model, model_name, args.batch_sizes, args.sequence_lengths,
|
||||
args.global_lengths, args.test_times, num_threads, args.verbose)
|
||||
def find_onnx_model(model_name, onnx_dir='.'):
|
||||
# Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
|
||||
# TODO: call convert_longformer_to_onnx to export onnx instead.
|
||||
import os.path
|
||||
onnx_model_path = os.path.join(onnx_dir, model_name + ".onnx")
|
||||
optimized_fp32_model = os.path.join(onnx_dir, model_name + "_fp32.onnx")
|
||||
optimized_fp16_model = os.path.join(onnx_dir, model_name + "_fp16.onnx")
|
||||
if os.path.isfile(optimized_fp16_model):
|
||||
onnx_model_path = optimized_fp16_model
|
||||
elif os.path.isfile(optimized_fp32_model):
|
||||
onnx_model_path = optimized_fp32_model
|
||||
return onnx_model_path
|
||||
|
||||
if "onnxruntime" in args.engines:
|
||||
if args.memory:
|
||||
test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0],
|
||||
args.global_lengths[0], args.test_times, num_threads)
|
||||
else: # test latency
|
||||
session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
|
||||
use_gpu=True,
|
||||
enable_all_optimization=True,
|
||||
num_threads=num_threads)
|
||||
if session is None:
|
||||
raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}")
|
||||
|
||||
results += test_ort_latency(device, model, model_name, session, args.batch_sizes,
|
||||
args.sequence_lengths, args.global_lengths, args.test_times,
|
||||
num_threads, optimized, precision, args.validate_onnx,
|
||||
args.disable_io_binding, args.verbose)
|
||||
return results
|
||||
def test_memory(args, device):
|
||||
if len(args.batch_sizes) > 1:
|
||||
raise RuntimeError("For memory test, only one batch_size (-b) is allowed.")
|
||||
if len(args.sequence_lengths) > 1:
|
||||
raise RuntimeError("For memory test, only one sequence_length (-s) is allowed.")
|
||||
if len(args.global_lengths) > 1:
|
||||
raise RuntimeError("For memory test, only one global_length (-g) is allowed.")
|
||||
|
||||
model_name = args.model
|
||||
onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
return test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0],
|
||||
args.global_lengths[0], args.test_times, args.num_threads)
|
||||
|
||||
|
||||
def test_ort(args, device):
|
||||
model_name = args.model
|
||||
|
||||
onnx_model_path = find_onnx_model(model_name) if not args.onnx else args.onnx
|
||||
|
||||
optimized = onnx_model_path.endswith("_fp16.onnx") or onnx_model_path.endswith("_fp32.onnx")
|
||||
precision = 'fp32' if not onnx_model_path.endswith("_fp16.onnx") else 'fp16'
|
||||
|
||||
model = load_torch_model(model_name, device)
|
||||
|
||||
num_threads = args.num_threads
|
||||
|
||||
session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
|
||||
use_gpu=True,
|
||||
enable_all_optimization=True,
|
||||
num_threads=num_threads)
|
||||
if session is None:
|
||||
raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}")
|
||||
|
||||
description = onnx_model_path
|
||||
if (os.environ.get('ORT_LONGFORMER_COMPACT_MEMORY', '0') == "1"):
|
||||
description += "[compact_memory]"
|
||||
|
||||
return test_ort_latency(device, model, model_name, description, session, args.batch_sizes, args.sequence_lengths,
|
||||
args.global_lengths, args.test_times, num_threads, optimized, precision, args.validate_onnx,
|
||||
args.disable_io_binding, args.verbose)
|
||||
|
||||
|
||||
def test_torch(args, device):
|
||||
model = load_torch_model(args.model, device)
|
||||
return test_torch_latency(device, model, args.model, args.batch_sizes, args.sequence_lengths, args.global_lengths,
|
||||
args.test_times, args.num_threads, args.verbose)
|
||||
|
||||
|
||||
def test_latency(args, device):
|
||||
if "onnxruntime" == args.engine:
|
||||
return test_ort(args, device)
|
||||
elif "torch" == args.engine:
|
||||
return test_torch(args, device)
|
||||
|
||||
raise RuntimeError("unknown engine " + args.engine)
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("-m",
|
||||
"--models",
|
||||
"--model",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["longformer-base-4096"],
|
||||
default="longformer-base-4096",
|
||||
help="Checkpoint directory or pre-trained model names in the list: " +
|
||||
", ".join(PRETRAINED_LONGFORMER_MODELS.keys()))
|
||||
|
||||
parser.add_argument("-e",
|
||||
"--engines",
|
||||
"--engine",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=['onnxruntime'],
|
||||
default='onnxruntime',
|
||||
choices=['onnxruntime', 'torch'],
|
||||
help="Engines to benchmark. For large model, recommend to test only one engine at a time.")
|
||||
help="Engine to benchmark.")
|
||||
|
||||
parser.add_argument("-t",
|
||||
"--test_times",
|
||||
|
|
@ -297,11 +342,7 @@ def parse_arguments(argv=None):
|
|||
"Sequence lengths. It could have multiple values in latency test. If --export_padding is not used in exporting onnx model, sequence length shall be multiple of window size."
|
||||
)
|
||||
|
||||
parser.add_argument("--onnx_dir",
|
||||
required=False,
|
||||
type=str,
|
||||
default=os.path.join('.', 'onnx_models'),
|
||||
help="Directory to search onnx models.")
|
||||
parser.add_argument("--onnx", required=False, type=str, default=None, help="Onnx model path")
|
||||
|
||||
parser.add_argument("-g",
|
||||
"--global_lengths",
|
||||
|
|
@ -310,13 +351,7 @@ def parse_arguments(argv=None):
|
|||
default=[0],
|
||||
help="Number of global tokens. It could have multiple values in latency test.")
|
||||
|
||||
parser.add_argument("-n",
|
||||
"--num_threads",
|
||||
required=False,
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[0],
|
||||
help="Threads to use. It could have multiple values in latency test.")
|
||||
parser.add_argument("-n", "--num_threads", required=False, type=int, default=0, help="Threads to use.")
|
||||
|
||||
parser.add_argument("-v",
|
||||
"--validate_onnx",
|
||||
|
|
@ -331,78 +366,35 @@ def parse_arguments(argv=None):
|
|||
parser.add_argument("--verbose", required=False, action="store_true", help="Print more information.")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def output_summary(results, csv_filename, args):
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
header_names = [
|
||||
"model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads"
|
||||
]
|
||||
data_names = []
|
||||
for batch_size in args.batch_sizes:
|
||||
for sequence_length in args.sequence_lengths:
|
||||
for global_length in args.global_lengths:
|
||||
data_names.append(f"b{batch_size}_s{sequence_length}_g{global_length}")
|
||||
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
|
||||
csv_writer.writeheader()
|
||||
for model in args.models:
|
||||
for input_count in [1, 2, 3]:
|
||||
for engine_name in args.engines:
|
||||
for io_binding in [True, False, ""]:
|
||||
for threads in args.num_threads:
|
||||
row = {}
|
||||
for result in results:
|
||||
if result["model_name"] == model and result["inputs"] == input_count and \
|
||||
result["engine"] == engine_name and result["io_binding"] == io_binding and \
|
||||
result["threads"] == threads:
|
||||
headers = {k: v for k, v in result.items() if k in header_names}
|
||||
if not row:
|
||||
row.update(headers)
|
||||
row.update({k: "" for k in data_names})
|
||||
else:
|
||||
for k in header_names:
|
||||
assert row[k] == headers[k]
|
||||
b = result["batch_size"]
|
||||
s = result["sequence_length"]
|
||||
g = result["global_length"]
|
||||
row[f"b{b}_s{s}_g{g}"] = result["average_latency_ms"]
|
||||
if row:
|
||||
csv_writer.writerow(row)
|
||||
|
||||
print(f"Summary results are saved to csv file: {csv_filename}")
|
||||
|
||||
|
||||
def output_details(results, csv_filename):
|
||||
latency_results = [result for result in results if 'average_latency_ms' in result]
|
||||
if len(latency_results) == 0:
|
||||
print("No latency results for output.")
|
||||
return
|
||||
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
column_names = [
|
||||
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads",
|
||||
"batch_size", "sequence_length", "global_length", "datetime", "test_times", "QPS", "average_latency_ms",
|
||||
"latency_variance", "latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
|
||||
"datetime", "test_times", "description", "batch_size", "sequence_length", "global_length", "memory", "QPS",
|
||||
"average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile",
|
||||
"latency_99_percentile"
|
||||
]
|
||||
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
||||
csv_writer.writeheader()
|
||||
for result in results:
|
||||
for result in latency_results:
|
||||
print(
|
||||
f"b={result['batch_size']}, s={result['sequence_length']}, g={result['global_length']}, latency={result['average_latency_ms']}ms, memory={result['memory']}MB {result['description']}"
|
||||
)
|
||||
csv_writer.writerow(result)
|
||||
|
||||
print(f"Detail results are saved to csv file: {csv_filename}")
|
||||
|
||||
|
||||
def main(args):
|
||||
assert len(args.models) == 1, "run only one model at a time"
|
||||
|
||||
if args.memory:
|
||||
if len(args.batch_sizes) > 1:
|
||||
raise RuntimeError("For memory test, only one batch_size (-b) is allowed.")
|
||||
if len(args.sequence_lengths) > 1:
|
||||
raise RuntimeError("For memory test, only one sequence_length (-s) is allowed.")
|
||||
if len(args.global_lengths) > 1:
|
||||
raise RuntimeError("For memory test, only one global_length (-g) is allowed.")
|
||||
if len(args.num_threads) > 1:
|
||||
raise RuntimeError("For memory test, only one value of --num_threads is allowed.")
|
||||
|
||||
def run(args):
|
||||
if not torch.cuda.is_available():
|
||||
raise RuntimeError("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
|
||||
|
||||
|
|
@ -411,18 +403,63 @@ def main(args):
|
|||
# set random seed manually to get deterministic results
|
||||
#benchmark_helper.set_random_seed(123)
|
||||
|
||||
all_results = test_all(args)
|
||||
# Currently, the longformer attention operator can only run on GPU (there is no CPU implementation yet).
|
||||
device = torch.device('cuda:0')
|
||||
|
||||
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
csv_filename = f"benchmark_detail_{time_stamp}.csv"
|
||||
output_details(all_results, csv_filename)
|
||||
if args.memory:
|
||||
return test_memory(args, device)
|
||||
else:
|
||||
return test_latency(args, device)
|
||||
|
||||
csv_filename = f"benchmark_summary_{time_stamp}.csv"
|
||||
output_summary(all_results, csv_filename, args)
|
||||
|
||||
def test_all():
|
||||
results = []
|
||||
test_times = 100
|
||||
sequence_lengths = [512, 1024, 2048, 4096]
|
||||
for model_name in ['longformer-base-4096']:
|
||||
for batch_size in [1]:
|
||||
for sequence_length in sequence_lengths:
|
||||
for global_length in [8]:
|
||||
engine_name = 'torch'
|
||||
args = parse_arguments(
|
||||
f"-e {engine_name} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name}"
|
||||
.split(' '))
|
||||
results += run(args)
|
||||
|
||||
engine_name = 'onnxruntime'
|
||||
onnx_paths = [f"{model_name}_fp32.onnx", f"{model_name}_fp16.onnx"] # optimized models
|
||||
for onnx_path in onnx_paths:
|
||||
if os.path.exists(onnx_path):
|
||||
for compact_memory in ["0", "1"]:
|
||||
os.environ["ORT_LONGFORMER_COMPACT_MEMORY"] = compact_memory
|
||||
print("ORT_LONGFORMER_COMPACT_MEMORY=", compact_memory)
|
||||
|
||||
args = parse_arguments(
|
||||
f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t 10 -m {model_name} --memory"
|
||||
.split(' '))
|
||||
memory_results = run(args)
|
||||
print(memory_results)
|
||||
|
||||
args = parse_arguments(
|
||||
f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name} --validate_onnx"
|
||||
.split(' '))
|
||||
latency_results = run(args)
|
||||
if len(latency_results) == 1:
|
||||
latency_results[0]["memory"] = memory_results["memory"]
|
||||
|
||||
print(latency_results)
|
||||
|
||||
results += latency_results
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
#args = parse_arguments("-e onnxruntime -t 1 -b 1 -s 4 -g 2 --onnx_dir . -t 1 -m longformer-random-tiny".split(' '))
|
||||
if len(sys.argv) > 1:
|
||||
args = parse_arguments()
|
||||
results = run(args)
|
||||
else:
|
||||
results = test_all()
|
||||
|
||||
main(args)
|
||||
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
csv_filename = f"benchmark_detail_{time_stamp}.csv"
|
||||
output_details(results, csv_filename)
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import transformers
|
|||
from torch.onnx import register_custom_op_symbolic
|
||||
from torch.onnx.symbolic_helper import parse_args
|
||||
from packaging import version
|
||||
|
||||
from pathlib import Path
|
||||
from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue