Add benchmark script for segment anything v2 (#22169)

### Description
Add benchmark script segment anything v2. 
It depends on https://github.com/microsoft/onnxruntime/pull/22119 for
onnx export, and https://github.com/microsoft/onnxruntime/pull/22167 for
sam2 graph fusion.

### Motivation and Context

Benchmark SAM2 model performance.
This commit is contained in:
Tianlei Wu 2024-09-20 21:32:37 -07:00 committed by GitHub
parent 1431215dcf
commit 171b901e32
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 569 additions and 0 deletions

View file

@ -0,0 +1,497 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
"""
Benchmark performance of SAM2 encoder with ORT or PyTorch. See benchmark_sam2.sh for usage.
"""
import argparse
import csv
import statistics
import time
from datetime import datetime
from typing import List, Mapping
import torch
from image_decoder import SAM2ImageDecoder
from image_encoder import SAM2ImageEncoder
from sam2_utils import decoder_shape_dict, encoder_shape_dict, load_sam2_model
from onnxruntime import InferenceSession, SessionOptions, get_available_providers
from onnxruntime.transformers.io_binding_helper import CudaSession
class TestConfig:
def __init__(
self,
model_type: str,
onnx_path: str,
sam2_dir: str,
device: torch.device,
component: str = "image_encoder",
provider="CPUExecutionProvider",
torch_compile_mode="max-autotune",
batch_size: int = 1,
height: int = 1024,
width: int = 1024,
num_labels: int = 1,
num_points: int = 1,
num_masks: int = 1,
multi_mask_output: bool = False,
use_tf32: bool = True,
enable_cuda_graph: bool = False,
dtype=torch.float32,
prefer_nhwc: bool = False,
warm_up: int = 5,
repeats: int = 1000,
verbose: bool = False,
):
assert model_type in ["sam2_hiera_tiny", "sam2_hiera_small", "sam2_hiera_large", "sam2_hiera_base_plus"]
assert height >= 160 and height <= 4096
assert width >= 160 and width <= 4096
self.model_type = model_type
self.onnx_path = onnx_path
self.sam2_dir = sam2_dir
self.component = component
self.provider = provider
self.torch_compile_mode = torch_compile_mode
self.batch_size = batch_size
self.height = height
self.width = width
self.num_labels = num_labels
self.num_points = num_points
self.num_masks = num_masks
self.multi_mask_output = multi_mask_output
self.device = device
self.use_tf32 = use_tf32
self.enable_cuda_graph = enable_cuda_graph
self.dtype = dtype
self.prefer_nhwc = prefer_nhwc
self.warm_up = 5
self.repeats = repeats
self.verbose = verbose
if self.component == "image_encoder":
assert self.height == 1024 and self.width == 1024, "Only image size 1024x1024 is allowed for image encoder."
def __repr__(self):
return f"{vars(self)}"
def shape_dict(self) -> Mapping[str, List[int]]:
if self.component == "image_encoder":
return encoder_shape_dict(self.batch_size, self.height, self.width)
else:
return decoder_shape_dict(self.height, self.width, self.num_labels, self.num_points, self.num_masks)
def random_inputs(self):
if self.component == "image_encoder":
return {
"image": torch.randn(
self.batch_size, 3, self.height, self.width, dtype=torch.float32, device=self.device
)
}
else:
return {
"image_features_0": torch.rand(1, 32, 256, 256, dtype=torch.float32, device=self.device),
"image_features_1": torch.rand(1, 64, 128, 128, dtype=torch.float32, device=self.device),
"image_embeddings": torch.rand(1, 256, 64, 64, dtype=torch.float32, device=self.device),
"point_coords": torch.randint(
0, 1024, (self.num_labels, self.num_points, 2), dtype=torch.float32, device=self.device
),
"point_labels": torch.randint(
0, 1, (self.num_labels, self.num_points), dtype=torch.int32, device=self.device
),
"input_masks": torch.zeros(self.num_labels, 1, 256, 256, dtype=torch.float32, device=self.device),
"has_input_masks": torch.ones(self.num_labels, dtype=torch.float32, device=self.device),
"original_image_size": torch.tensor([self.height, self.width], dtype=torch.int32, device=self.device),
}
def create_ort_session(config: TestConfig, session_options=None) -> InferenceSession:
if config.verbose:
print(f"create session for {vars(config)}")
if config.provider == "CUDAExecutionProvider":
device_id = torch.cuda.current_device() if isinstance(config.device, str) else config.device.index
provider_options = CudaSession.get_cuda_provider_options(device_id, config.enable_cuda_graph)
provider_options["use_tf32"] = int(config.use_tf32)
if config.prefer_nhwc:
provider_options["prefer_nhwc"] = 1
providers = [(config.provider, provider_options), "CPUExecutionProvider"]
else:
providers = ["CPUExecutionProvider"]
ort_session = InferenceSession(config.onnx_path, session_options, providers=providers)
return ort_session
def create_session(config: TestConfig, session_options=None) -> CudaSession:
ort_session = create_ort_session(config, session_options)
cuda_session = CudaSession(ort_session, config.device, config.enable_cuda_graph)
cuda_session.allocate_buffers(config.shape_dict())
return cuda_session
class OrtTestSession:
"""A wrapper of ORT session to test relevance and performance."""
def __init__(self, config: TestConfig, session_options=None):
self.ort_session = create_session(config, session_options)
self.feed_dict = config.random_inputs()
def infer(self):
return self.ort_session.infer(self.feed_dict)
def measure_latency(cuda_session: CudaSession, input_dict):
start = time.time()
_ = cuda_session.infer(input_dict)
end = time.time()
return end - start
def run_torch(config: TestConfig):
device_type = config.device.type
is_cuda = device_type == "cuda"
# Turn on TF32 for Ampere GPUs which could help when data type is float32.
if is_cuda and torch.cuda.get_device_properties(0).major >= 8 and config.use_tf32:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
enabled_auto_cast = is_cuda and config.dtype != torch.float32
ort_inputs = config.random_inputs()
with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=config.dtype, enabled=enabled_auto_cast):
sam2_model = load_sam2_model(config.sam2_dir, config.model_type, device=config.device)
if config.component == "image_encoder":
if is_cuda:
sam2_model.image_encoder.forward = torch.compile(
sam2_model.image_encoder.forward,
mode=config.torch_compile_mode, # "reduce-overhead" if you want to reduce latency of first run.
fullgraph=True,
dynamic=False,
)
image_shape = config.shape_dict()["image"]
img = torch.randn(image_shape).to(device=config.device, dtype=config.dtype)
sam2_encoder = SAM2ImageEncoder(sam2_model)
if is_cuda:
print(f"Running warm up. It will take a while since torch compile mode is {config.torch_compile_mode}.")
for _ in range(config.warm_up):
_image_features_0, _image_features_1, _image_embeddings = sam2_encoder(img)
print(f"Start {config.repeats} runs of performance tests...")
start = time.time()
for _ in range(config.repeats):
_image_features_0, _image_features_1, _image_embeddings = sam2_encoder(img)
if is_cuda:
torch.cuda.synchronize()
else:
torch_inputs = (
ort_inputs["image_features_0"],
ort_inputs["image_features_1"],
ort_inputs["image_embeddings"],
ort_inputs["point_coords"],
ort_inputs["point_labels"],
ort_inputs["input_masks"],
ort_inputs["has_input_masks"],
ort_inputs["original_image_size"],
)
sam2_decoder = SAM2ImageDecoder(sam2_model, multimask_output=config.multi_mask_output)
# warm up
for _ in range(3):
_masks, _iou_predictions, _low_res_masks = sam2_decoder(*torch_inputs)
print(f"Start {config.repeats} runs of performance tests...")
start = time.time()
for _ in range(config.repeats):
_masks, _iou_predictions, _low_res_masks = sam2_decoder(*torch_inputs)
if is_cuda:
torch.cuda.synchronize()
end = time.time()
return (end - start) / config.repeats
def run_test(
csv_writer: csv.DictWriter,
args: argparse.Namespace,
):
use_gpu: bool = args.use_gpu
enable_cuda_graph: bool = args.use_cuda_graph
repeats: int = args.repeats
if use_gpu:
device_id = torch.cuda.current_device()
device = torch.device("cuda", device_id)
provider = "CUDAExecutionProvider"
else:
device_id = 0
device = torch.device("cpu")
enable_cuda_graph = False
provider = "CPUExecutionProvider"
dtypes = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
config = TestConfig(
model_type=args.model_type,
onnx_path=args.onnx_path,
sam2_dir=args.sam2_dir,
component=args.component,
provider=provider,
batch_size=args.batch_size,
height=args.height,
width=args.width,
device=device,
use_tf32=True,
enable_cuda_graph=False,
dtype=dtypes[args.dtype],
prefer_nhwc=args.prefer_nhwc,
repeats=args.repeats,
warm_up=args.warm_up,
verbose=False,
)
if args.engine == "ort":
sess_options = SessionOptions()
sess_options.intra_op_num_threads = args.intra_op_num_threads
session = create_session(config, sess_options)
input_dict = config.random_inputs()
# warm up session
try:
for _ in range(config.warm_up):
_ = measure_latency(session, input_dict)
except Exception as e:
print(f"Failed to run {config=}. Exception: {e}")
return
latency_list = []
for _ in range(repeats):
latency = measure_latency(session, input_dict)
latency_list.append(latency)
average_latency = statistics.mean(latency_list)
del session
else: # torch
with torch.no_grad():
try:
average_latency = run_torch(config)
except Exception as e:
print(f"Failed to run {config=}. Exception: {e}")
return
engine = args.engine + ":" + ("cuda" if use_gpu else "cpu")
row = {
"model_type": args.model_type,
"component": args.component,
"dtype": args.dtype,
"use_gpu": use_gpu,
"enable_cuda_graph": enable_cuda_graph,
"prefer_nhwc": config.prefer_nhwc,
"use_tf32": config.use_tf32,
"batch_size": args.batch_size,
"height": args.height,
"width": args.width,
"multi_mask_output": args.multimask_output,
"num_labels": config.num_labels,
"num_points": config.num_points,
"num_masks": config.num_masks,
"intra_op_num_threads": args.intra_op_num_threads,
"engine": engine,
"warm_up": config.warm_up,
"repeats": repeats,
"average_latency": average_latency,
}
csv_writer.writerow(row)
print(f"{vars(config)}")
print(f"{row}")
def run_tests(args):
features = "gpu" if args.use_gpu else "cpu"
csv_filename = "benchmark_sam_{}_{}_{}.csv".format(
features,
args.engine,
datetime.now().strftime("%Y%m%d-%H%M%S"),
)
with open(csv_filename, mode="a", newline="") as csv_file:
column_names = [
"model_type",
"component",
"dtype",
"use_gpu",
"enable_cuda_graph",
"prefer_nhwc",
"use_tf32",
"batch_size",
"height",
"width",
"multi_mask_output",
"num_labels",
"num_points",
"num_masks",
"intra_op_num_threads",
"engine",
"warm_up",
"repeats",
"average_latency",
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
csv_writer.writeheader()
run_test(csv_writer, args)
def _parse_arguments():
parser = argparse.ArgumentParser(description="Benchmark SMA2 for ONNX Runtime and PyTorch.")
parser.add_argument(
"--component",
required=False,
choices=["image_encoder", "image_decoder"],
default="image_encoder",
help="component to benchmark. Choices are image_encoder and image_decoder.",
)
parser.add_argument(
"--dtype", required=False, choices=["fp32", "fp16", "bf16"], default="fp32", help="Data type for inference."
)
parser.add_argument(
"--use_gpu",
required=False,
action="store_true",
help="Use GPU for inference.",
)
parser.set_defaults(use_gpu=False)
parser.add_argument(
"--use_cuda_graph",
required=False,
action="store_true",
help="Use cuda graph in onnxruntime.",
)
parser.set_defaults(use_cuda_graph=False)
parser.add_argument(
"--intra_op_num_threads",
required=False,
type=int,
choices=[0, 1, 2, 4, 8, 16],
default=0,
help="intra_op_num_threads for onnxruntime. ",
)
parser.add_argument(
"--batch_size",
required=False,
type=int,
default=1,
help="batch size",
)
parser.add_argument(
"--height",
required=False,
type=int,
default=1024,
help="image height",
)
parser.add_argument(
"--width",
required=False,
type=int,
default=1024,
help="image width",
)
parser.add_argument(
"--repeats",
required=False,
type=int,
default=1000,
help="number of repeats for performance test. Default is 1000.",
)
parser.add_argument(
"--warm_up",
required=False,
type=int,
default=5,
help="number of runs for warm up. Default is 5.",
)
parser.add_argument(
"--engine",
required=False,
type=str,
default="ort",
choices=["ort", "torch"],
help="engine for inference",
)
parser.add_argument(
"--multimask_output",
required=False,
default=False,
action="store_true",
help="Export mask_decoder or image_decoder with multimask_output",
)
parser.add_argument(
"--prefer_nhwc",
required=False,
default=False,
action="store_true",
help="Use prefer_nhwc=1 provider option for CUDAExecutionProvider",
)
parser.add_argument(
"--model_type",
required=False,
type=str,
default="sam2_hiera_large",
choices=["sam2_hiera_tiny", "sam2_hiera_small", "sam2_hiera_large", "sam2_hiera_base_plus"],
help="sam2 model name",
)
parser.add_argument(
"--sam2_dir",
required=False,
type=str,
default="./segment-anything-2",
help="The directory of segment-anything-2 git root directory",
)
parser.add_argument(
"--onnx_path",
required=False,
type=str,
default="./sam2_onnx_models/sam2_hiera_large_image_encoder.onnx",
help="path of onnx model",
)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = _parse_arguments()
print(f"arguments:{args}")
if args.use_gpu:
assert torch.cuda.is_available()
if args.engine == "ort":
assert "CUDAExecutionProvider" in get_available_providers()
run_tests(args)

View file

@ -0,0 +1,72 @@
#!/bin/sh
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
# Directory of the script
dir="$( cd "$( dirname "$0" )" && pwd )"
# Directory of the onnx models
onnx_dir=$dir/sam2_onnx_models
# Directory of the sam2 code by "git clone https://github.com/facebookresearch/segment-anything-2"
sam2_dir=~/segment-anything-2
# model name to benchmark
model=sam2_hiera_large
run_cpu()
{
repeats=$1
python3 convert_to_onnx.py --sam2_dir $sam2_dir --optimize --demo
echo "Benchmarking SAM2 model $model image encoder for PyTorch ..."
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --dtype fp32
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --dtype fp16
echo "Benchmarking SAM2 model $model image encoder for PyTorch ..."
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --dtype fp32 --component image_decoder
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --dtype fp16 --component image_decoder
echo "Benchmarking SAM2 model $model image encoder for ORT ..."
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_encoder.onnx --dtype fp32
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_encoder_fp32_cpu.onnx --dtype fp32
echo "Benchmarking SAM2 model $model image decoder for ORT ..."
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_decoder.onnx --component image_decoder
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_decoder_fp32_cpu.onnx --component image_decoder
}
run_gpu()
{
repeats=$1
python3 convert_to_onnx.py --sam2_dir $sam2_dir --optimize --use_gpu --dtype fp32
python3 convert_to_onnx.py --sam2_dir $sam2_dir --optimize --use_gpu --dtype fp16 --demo
echo "Benchmarking SAM2 model $model image encoder for PyTorch ..."
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype fp16
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype bf16
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype fp32
echo "Benchmarking SAM2 model $model image decoder for PyTorch ..."
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype fp16 --component image_decoder
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype bf16 --component image_decoder
python3 benchmark_sam2.py --model_type $model --engine torch --sam2_dir $sam2_dir --repeats $repeats --use_gpu --dtype fp32 --component image_decoder
echo "Benchmarking SAM2 model $model image encoder for ORT ..."
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_encoder_fp16_gpu.onnx --use_gpu --dtype fp16
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_encoder_fp32_gpu.onnx --use_gpu --dtype fp32
echo "Benchmarking SAM2 model $model image decoder for ORT ..."
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_decoder_fp16_gpu.onnx --component image_decoder --use_gpu --dtype fp16
python3 benchmark_sam2.py --model_type $model --engine ort --sam2_dir $sam2_dir --repeats $repeats --onnx_path ${onnx_dir}/${model}_image_decoder_fp32_gpu.onnx --component image_decoder --use_gpu
}
if python3 -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
run_gpu 1000
else
run_cpu 100
fi