update benchmark_longformer for default test suite (#6772)

This commit is contained in:
Tianlei Wu 2021-02-22 22:00:59 -08:00 committed by GitHub
parent 8a6f6bc38b
commit 8703e2c778
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 193 additions and 151 deletions

View file

@ -297,10 +297,8 @@ def measure_memory(is_gpu, func):
return None
monitor = MemoryMonitor(False)
if is_gpu:
print(f"GPU memory usage before testing: {monitor.measure_gpu_usage()}")
else:
print(f"Peak CPU memory usage before testing: {monitor.measure_cpu_usage():.2f} MB")
memory_before_test = monitor.measure_gpu_usage() if is_gpu else monitor.measure_cpu_usage()
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor() as executor:
@ -312,8 +310,15 @@ def measure_memory(is_gpu, func):
finally:
monitor.keep_measuring = False
max_usage = mem_thread.result()
if is_gpu:
print(f"Peak GPU memory usage: {max_usage}")
print(f"GPU memory usage: before={memory_before_test} peak={max_usage}")
if len(memory_before_test) >= 1 and len(max_usage) >= 1:
before = memory_before_test[0]["max_used_MB"]
after = max_usage[0]["max_used_MB"]
return after - before
else:
return None
else:
print(f"Peak CPU memory usage: {max_usage:.2f} MB")
return max_usage
print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB")
return max_usage - memory_before_test

View file

@ -13,17 +13,20 @@
# cd ../torch_extensions
# python setup.py install
# cd ../longformer
# python convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp32 --optimize_onnx
# python convert_longformer_to_onnx.py --model longformer-base-4096 --precision fp16 --optimize_onnx
#
# When there is no parameter, all available tests (memory & latency) will run on the longformer-base-4096 pretrained model.
# python benchmark_longformer.py
#
# Benchmark the latency (Exported onnx model is in the current directory):
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx_dir . --validate_onnx -t 100
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 512 1024 2048 4096 --global_lengths 8 --onnx ./longformer-base-4096_fp16.onnx --validate_onnx -t 100
#
# Benchmark GPU peak memory:
# export ORT_LONGFORMER_COMPACT_MEMORY=0
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
# export ORT_LONGFORMER_COMPACT_MEMORY=1
# python benchmark_longformer.py --models longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
# By default, compact memory kernel is not enabled since it is slower. You need set an environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it, which uses less memory in this test.
# python benchmark_longformer.py --model longformer-base-4096 --batch_sizes 1 --sequence_lengths 4096 --global_lengths 8 --onnx_dir . --memory -t 10
# By default, compact memory kernel is not enabled. You need to set an environment variable ORT_LONGFORMER_COMPACT_MEMORY=1 to enable it.
import timeit
from datetime import datetime
@ -34,7 +37,6 @@ import sys
import torch
import onnxruntime
import numpy as np
import pprint
import math
from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS
@ -67,12 +69,14 @@ def test_torch_latency(device, model, model_name, batch_sizes, sequence_lengths,
"precision": "fp32",
"io_binding": "",
"model_name": model_name,
"description": model_name + "[torch]",
"inputs": 3,
"threads": num_threads,
"batch_size": batch_size,
"sequence_length": sequence_length,
"global_length": global_length,
"datetime": str(datetime.now()),
"memory": "?",
}
result.update(benchmark_helper.get_latency_result(runtimes, batch_size))
@ -96,11 +100,13 @@ def test_parity(device, model, ort_session, batch_size, sequence_length, global_
if verbose and (math.isnan(max_diff) or max_diff > 0.001):
print("torch last_state:", torch_outputs[0])
print("ort last_state:", ort_outputs[0])
return max_diff
def test_ort_latency(device,
model,
model_name,
description,
ort_session,
batch_sizes,
sequence_lengths,
@ -135,6 +141,7 @@ def test_ort_latency(device,
result_template = {
"model_name": model_name,
"description": description,
"inputs": 3,
"engine": "OnnxRuntime",
"version": onnxruntime.__version__,
@ -147,6 +154,7 @@ def test_ort_latency(device,
"global_length": global_length,
"test_times": test_times,
"datetime": str(datetime.now()),
"memory": "",
}
if not disable_io_binding:
@ -172,12 +180,12 @@ def test_ort_latency(device,
repeat_times=test_times,
batch_size=batch_size)
pprint.pprint(result)
results.append(result)
if validate_onnx:
test_parity(device, model, ort_session, batch_size, sequence_length, global_length, verbose)
max_diff = test_parity(device, model, ort_session, batch_size, sequence_length, global_length,
verbose)
result["description"] += f"(max_diff={max_diff})"
results.append(result)
return results
@ -198,83 +206,120 @@ def test_ort_memory(device, onnx_model_path, batch_size, sequence_length, global
for _ in range(test_times):
ort_outputs = session.run(None, ort_inputs)
benchmark_helper.measure_memory(is_gpu=True, func=inference)
print("Memory test is done")
memory_used = benchmark_helper.measure_memory(is_gpu=True, func=inference)
return {
"onnx_model": onnx_model_path,
"batch_size": batch_size,
"sequence_length": sequence_length,
"global_length": global_length,
"test_times": test_times,
"num_threads": num_threads,
"memory": memory_used
}
def test_all(args):
# Currently, the longformer attention operator could only run in GPU (no CPU implementation yet).
device = torch.device('cuda:0')
def load_torch_model(model_name, device):
torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[
model_name] if model_name in PRETRAINED_LONGFORMER_MODELS else model_name
results = []
for model_name in args.models:
# Here we run an example input
from transformers import LongformerModel
torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
model = LongformerModel.from_pretrained(torch_model_name_or_dir) # pretrained model name or directory
model.to(device)
from transformers import LongformerModel
model = LongformerModel.from_pretrained(torch_model_name_or_dir)
model.to(device)
return model
# Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
# TODO: call convert_longformer_to_onnx to export onnx instead.
import os.path
optimized = False
precision = 'fp32'
onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
optimized_fp32_model = os.path.join(args.onnx_dir, model_name + "_fp32.onnx")
optimized_fp16_model = os.path.join(args.onnx_dir, model_name + "_fp16.onnx")
if os.path.isfile(optimized_fp16_model):
onnx_model_path = optimized_fp16_model
optimized = True
precision = 'fp16'
elif os.path.isfile(optimized_fp32_model):
onnx_model_path = optimized_fp32_model
optimized = True
print("ONNX model path:", onnx_model_path)
for num_threads in args.num_threads:
if "torch" in args.engines:
results += test_torch_latency(device, model, model_name, args.batch_sizes, args.sequence_lengths,
args.global_lengths, args.test_times, num_threads, args.verbose)
def find_onnx_model(model_name, onnx_dir='.'):
    """Pick the ONNX model file for ``model_name`` inside ``onnx_dir``.

    Candidates are probed in priority order: the optimized fp16 model
    (``<model_name>_fp16.onnx``) first, then the optimized fp32 model
    (``<model_name>_fp32.onnx``). If neither file exists, the path of the raw
    model (``<model_name>.onnx``) is returned without checking that it exists.

    Args:
        model_name: Base name of the model files.
        onnx_dir: Directory to search; defaults to the current directory.

    Returns:
        Path (string) of the selected ONNX model.
    """
    # TODO: call convert_longformer_to_onnx to export onnx instead.
    import os.path

    # Highest-priority suffix first; the first existing file wins.
    for suffix in ("_fp16.onnx", "_fp32.onnx"):
        candidate = os.path.join(onnx_dir, model_name + suffix)
        if os.path.isfile(candidate):
            return candidate

    # Fall back to the raw (non-optimized) model path.
    return os.path.join(onnx_dir, model_name + ".onnx")
if "onnxruntime" in args.engines:
if args.memory:
test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0],
args.global_lengths[0], args.test_times, num_threads)
else: # test latency
session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
use_gpu=True,
enable_all_optimization=True,
num_threads=num_threads)
if session is None:
raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}")
results += test_ort_latency(device, model, model_name, session, args.batch_sizes,
args.sequence_lengths, args.global_lengths, args.test_times,
num_threads, optimized, precision, args.validate_onnx,
args.disable_io_binding, args.verbose)
return results
def test_memory(args, device):
    """Run the ONNX Runtime memory test for a single test configuration.

    The memory test accepts exactly one batch size, one sequence length and one
    global length. The ONNX model is ``args.onnx`` when given, otherwise the
    file chosen by ``find_onnx_model(args.model)``.

    Args:
        args: Parsed command-line arguments.
        device: Device forwarded to ``test_ort_memory``.

    Returns:
        Whatever ``test_ort_memory`` returns.

    Raises:
        RuntimeError: If more than one batch size, sequence length or global
            length was supplied.
    """
    # (attribute name, error message) pairs, checked in this order. Attributes
    # are read lazily so the first violation is the one reported.
    single_value_options = (
        ("batch_sizes", "For memory test, only one batch_size (-b) is allowed."),
        ("sequence_lengths", "For memory test, only one sequence_length (-s) is allowed."),
        ("global_lengths", "For memory test, only one global_length (-g) is allowed."),
    )
    for option_name, error_message in single_value_options:
        if len(getattr(args, option_name)) > 1:
            raise RuntimeError(error_message)

    model_name = args.model
    if args.onnx:
        onnx_model_path = args.onnx
    else:
        onnx_model_path = find_onnx_model(model_name)

    # Release cached CUDA memory held by PyTorch before measuring.
    torch.cuda.empty_cache()

    return test_ort_memory(device, onnx_model_path, args.batch_sizes[0], args.sequence_lengths[0],
                           args.global_lengths[0], args.test_times, args.num_threads)
def test_ort(args, device):
    """Benchmark ONNX Runtime latency for the model selected by ``args``.

    The ONNX model is ``args.onnx`` when given, otherwise the file chosen by
    ``find_onnx_model(args.model)``. Whether the model counts as optimized, and
    its precision, are derived from the file name suffix (``_fp16.onnx`` /
    ``_fp32.onnx``).

    Args:
        args: Parsed command-line arguments.
        device: Torch device the PyTorch model is loaded onto.

    Returns:
        The list of results produced by ``test_ort_latency``.

    Raises:
        RuntimeError: If the ONNX Runtime session could not be created.
    """
    model_name = args.model
    if args.onnx:
        onnx_model_path = args.onnx
    else:
        onnx_model_path = find_onnx_model(model_name)

    # File naming convention: *_fp16.onnx / *_fp32.onnx are optimized models.
    is_fp16 = onnx_model_path.endswith("_fp16.onnx")
    optimized = is_fp16 or onnx_model_path.endswith("_fp32.onnx")
    precision = 'fp16' if is_fp16 else 'fp32'

    model = load_torch_model(model_name, device)

    num_threads = args.num_threads
    session = benchmark_helper.create_onnxruntime_session(onnx_model_path,
                                                          use_gpu=True,
                                                          enable_all_optimization=True,
                                                          num_threads=num_threads)
    if session is None:
        raise RuntimeError(f"Failed to create ORT sesssion from ONNX file {onnx_model_path}")

    # Tag the result description when ORT_LONGFORMER_COMPACT_MEMORY is "1".
    compact_memory_enabled = os.environ.get('ORT_LONGFORMER_COMPACT_MEMORY', '0') == "1"
    description = (onnx_model_path + "[compact_memory]") if compact_memory_enabled else onnx_model_path

    return test_ort_latency(device, model, model_name, description, session, args.batch_sizes, args.sequence_lengths,
                            args.global_lengths, args.test_times, num_threads, optimized, precision,
                            args.validate_onnx, args.disable_io_binding, args.verbose)
def test_torch(args, device):
    """Benchmark PyTorch latency for the model named by ``args.model``.

    Args:
        args: Parsed command-line arguments.
        device: Torch device the model is loaded onto and benchmarked on.

    Returns:
        The list of results produced by ``test_torch_latency``.
    """
    model_name = args.model
    torch_model = load_torch_model(model_name, device)
    return test_torch_latency(device, torch_model, model_name, args.batch_sizes, args.sequence_lengths,
                              args.global_lengths, args.test_times, args.num_threads, args.verbose)
def test_latency(args, device):
    """Dispatch the latency benchmark to the engine selected by ``args.engine``.

    Args:
        args: Parsed command-line arguments; ``args.engine`` is either
            "onnxruntime" or "torch".
        device: Device forwarded to the engine-specific benchmark.

    Returns:
        The result of ``test_ort`` or ``test_torch``.

    Raises:
        RuntimeError: If ``args.engine`` is neither supported engine.
    """
    engine = args.engine
    if engine == "onnxruntime":
        return test_ort(args, device)
    if engine == "torch":
        return test_torch(args, device)
    raise RuntimeError("unknown engine " + args.engine)
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument("-m",
"--models",
"--model",
required=False,
nargs="+",
type=str,
default=["longformer-base-4096"],
default="longformer-base-4096",
help="Checkpoint directory or pre-trained model names in the list: " +
", ".join(PRETRAINED_LONGFORMER_MODELS.keys()))
parser.add_argument("-e",
"--engines",
"--engine",
required=False,
nargs="+",
type=str,
default=['onnxruntime'],
default='onnxruntime',
choices=['onnxruntime', 'torch'],
help="Engines to benchmark. For large model, recommend to test only one engine at a time.")
help="Engine to benchmark.")
parser.add_argument("-t",
"--test_times",
@ -297,11 +342,7 @@ def parse_arguments(argv=None):
"Sequence lengths. It could have multiple values in latency test. If --export_padding is not used in exporting onnx model, sequence length shall be multiple of window size."
)
parser.add_argument("--onnx_dir",
required=False,
type=str,
default=os.path.join('.', 'onnx_models'),
help="Directory to search onnx models.")
parser.add_argument("--onnx", required=False, type=str, default=None, help="Onnx model path")
parser.add_argument("-g",
"--global_lengths",
@ -310,13 +351,7 @@ def parse_arguments(argv=None):
default=[0],
help="Number of global tokens. It could have multiple values in latency test.")
parser.add_argument("-n",
"--num_threads",
required=False,
nargs="+",
type=int,
default=[0],
help="Threads to use. It could have multiple values in latency test.")
parser.add_argument("-n", "--num_threads", required=False, type=int, default=0, help="Threads to use.")
parser.add_argument("-v",
"--validate_onnx",
@ -331,78 +366,35 @@ def parse_arguments(argv=None):
parser.add_argument("--verbose", required=False, action="store_true", help="Print more information.")
args = parser.parse_args(argv)
return args
def output_summary(results, csv_filename, args):
with open(csv_filename, mode="a", newline='') as csv_file:
header_names = [
"model_name", "inputs", "engine", "version", "device", "precision", "optimizer", "io_binding", "threads"
]
data_names = []
for batch_size in args.batch_sizes:
for sequence_length in args.sequence_lengths:
for global_length in args.global_lengths:
data_names.append(f"b{batch_size}_s{sequence_length}_g{global_length}")
csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
csv_writer.writeheader()
for model in args.models:
for input_count in [1, 2, 3]:
for engine_name in args.engines:
for io_binding in [True, False, ""]:
for threads in args.num_threads:
row = {}
for result in results:
if result["model_name"] == model and result["inputs"] == input_count and \
result["engine"] == engine_name and result["io_binding"] == io_binding and \
result["threads"] == threads:
headers = {k: v for k, v in result.items() if k in header_names}
if not row:
row.update(headers)
row.update({k: "" for k in data_names})
else:
for k in header_names:
assert row[k] == headers[k]
b = result["batch_size"]
s = result["sequence_length"]
g = result["global_length"]
row[f"b{b}_s{s}_g{g}"] = result["average_latency_ms"]
if row:
csv_writer.writerow(row)
print(f"Summary results are saved to csv file: {csv_filename}")
def output_details(results, csv_filename):
latency_results = [result for result in results if 'average_latency_ms' in result]
if len(latency_results) == 0:
print("No latency results for output.")
return
with open(csv_filename, mode="a", newline='') as csv_file:
column_names = [
"engine", "version", "device", "precision", "optimizer", "io_binding", "model_name", "inputs", "threads",
"batch_size", "sequence_length", "global_length", "datetime", "test_times", "QPS", "average_latency_ms",
"latency_variance", "latency_90_percentile", "latency_95_percentile", "latency_99_percentile"
"datetime", "test_times", "description", "batch_size", "sequence_length", "global_length", "memory", "QPS",
"average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile",
"latency_99_percentile"
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
csv_writer.writeheader()
for result in results:
for result in latency_results:
print(
f"b={result['batch_size']}, s={result['sequence_length']}, g={result['global_length']}, latency={result['average_latency_ms']}ms, memory={result['memory']}MB {result['description']}"
)
csv_writer.writerow(result)
print(f"Detail results are saved to csv file: {csv_filename}")
def main(args):
assert len(args.models) == 1, "run only one model at a time"
if args.memory:
if len(args.batch_sizes) > 1:
raise RuntimeError("For memory test, only one batch_size (-b) is allowed.")
if len(args.sequence_lengths) > 1:
raise RuntimeError("For memory test, only one sequence_length (-s) is allowed.")
if len(args.global_lengths) > 1:
raise RuntimeError("For memory test, only one global_length (-g) is allowed.")
if len(args.num_threads) > 1:
raise RuntimeError("For memory test, only one value of --num_threads is allowed.")
def run(args):
if not torch.cuda.is_available():
raise RuntimeError("Please install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.")
@ -411,18 +403,63 @@ def main(args):
# set random seed manually to get deterministic results
#benchmark_helper.set_random_seed(123)
all_results = test_all(args)
# Currently, the longformer attention operator could only run in GPU (no CPU implementation yet).
device = torch.device('cuda:0')
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
csv_filename = f"benchmark_detail_{time_stamp}.csv"
output_details(all_results, csv_filename)
if args.memory:
return test_memory(args, device)
else:
return test_latency(args, device)
csv_filename = f"benchmark_summary_{time_stamp}.csv"
output_summary(all_results, csv_filename, args)
def test_all():
    """Run the default benchmark suite and return the accumulated results.

    For the longformer-base-4096 model with batch size 1 and global length 8,
    each sequence length in [512, 1024, 2048, 4096] is benchmarked with PyTorch.
    Then, for every optimized ONNX model file (fp32, fp16) that exists in the
    current directory and for each ORT_LONGFORMER_COMPACT_MEMORY value ("0",
    "1"), a memory test is run followed by a latency test with --validate_onnx.
    When the latency run yields exactly one result, the measured memory is
    copied into it.

    Side effect: sets os.environ["ORT_LONGFORMER_COMPACT_MEMORY"]; the last
    value assigned stays in the environment after returning.

    Returns:
        List of result dictionaries from the torch and onnxruntime latency runs.
    """
    results = []
    test_times = 100
    sequence_lengths = [512, 1024, 2048, 4096]
    for model_name in ['longformer-base-4096']:
        for batch_size in [1]:
            for sequence_length in sequence_lengths:
                for global_length in [8]:
                    engine_name = 'torch'
                    torch_argv = f"-e {engine_name} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name}".split(
                        ' ')
                    args = parse_arguments(torch_argv)
                    results.extend(run(args))

                    engine_name = 'onnxruntime'
                    onnx_paths = [f"{model_name}_fp32.onnx", f"{model_name}_fp16.onnx"]  # optimized models
                    for onnx_path in onnx_paths:
                        # Only benchmark optimized models that were actually exported.
                        if not os.path.exists(onnx_path):
                            continue

                        for compact_memory in ["0", "1"]:
                            os.environ["ORT_LONGFORMER_COMPACT_MEMORY"] = compact_memory
                            print("ORT_LONGFORMER_COMPACT_MEMORY=", compact_memory)

                            # NOTE(review): -t is passed twice here; argparse is expected to keep
                            # the last occurrence (10) for the memory run -- confirm.
                            memory_argv = f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t 10 -m {model_name} --memory".split(
                                ' ')
                            args = parse_arguments(memory_argv)
                            memory_results = run(args)
                            print(memory_results)

                            latency_argv = f"--disable_io_binding -e {engine_name} --onnx {onnx_path} -t {test_times} -b {batch_size} -s {sequence_length} -g {global_length} -t {test_times} -m {model_name} --validate_onnx".split(
                                ' ')
                            args = parse_arguments(latency_argv)
                            latency_results = run(args)
                            if len(latency_results) == 1:
                                # Attach the memory measured just above to the single latency row.
                                latency_results[0]["memory"] = memory_results["memory"]

                            print(latency_results)

                            results.extend(latency_results)
    return results
if __name__ == "__main__":
args = parse_arguments()
#args = parse_arguments("-e onnxruntime -t 1 -b 1 -s 4 -g 2 --onnx_dir . -t 1 -m longformer-random-tiny".split(' '))
if len(sys.argv) > 1:
args = parse_arguments()
results = run(args)
else:
results = test_all()
main(args)
time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
csv_filename = f"benchmark_detail_{time_stamp}.csv"
output_details(results, csv_filename)

View file

@ -22,7 +22,7 @@ import transformers
from torch.onnx import register_custom_op_symbolic
from torch.onnx.symbolic_helper import parse_args
from packaging import version
from pathlib import Path
from longformer_helper import LongformerHelper, PRETRAINED_LONGFORMER_MODELS