onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
Tianlei Wu 09c98433e7
[CUDA] stable diffusion benchmark allows IO binding for optimum (#22834)
### Description

Update stable diffusion benchmark:
(1) allow IO binding for optimum.
(2) do not use num_images_per_prompt across all engines for fair
comparison.

Example to run benchmark of optimum on stable diffusion 1.5:
```
git clone https://github.com/tianleiwu/optimum
cd optimum
git checkout tlwu/diffusers-io-binding
pip install -e .

pip install -U onnxruntime-gpu
git clone https://github.com/microsoft/onnxruntime
cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion
git checkout tlwu/benchmark_sd_optimum_io_binding
pip install -r requirements/cuda12/requirements.txt

optimum-cli export onnx --model runwayml/stable-diffusion-v1-5  --task text-to-image ./sd_onnx_fp32

python optimize_pipeline.py -i ./sd_onnx_fp32 -o ./sd_onnx_fp16 --float16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 --use_io_binding
```

Example output on H100_80GB_HBM3: 572 ms with I/O binding and 588 ms without
I/O binding, so I/O binding saves 16 ms, or 2.7%.

### Motivation and Context

Optimum is working on enabling I/O binding:
https://github.com/huggingface/optimum/pull/2056. This change helps test
the impact of I/O binding on the performance of stable diffusion.
2024-11-14 00:09:07 -08:00

1466 lines
46 KiB
Python
Executable file

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
import argparse
import csv
import os
import statistics
import sys
import time
import __init__ # noqa: F401. Walk-around to run this script directly
import coloredlogs
# import torch before onnxruntime so that onnxruntime uses the cuDNN in the torch package.
import torch
from benchmark_helper import measure_memory
# Maps each value accepted by the --version option to the model name that is
# handed to the pipeline loaders.
SD_MODELS = {
    "1.5": "runwayml/stable-diffusion-v1-5",
    "2.0": "stabilityai/stable-diffusion-2",
    "2.1": "stabilityai/stable-diffusion-2-1",
    "xl-1.0": "stabilityai/stable-diffusion-xl-refiner-1.0",
}

# Maps each value accepted by the --provider option to the corresponding
# execution provider name.
PROVIDERS = {
    "cuda": "CUDAExecutionProvider",
    "rocm": "ROCMExecutionProvider",
    "migraphx": "MIGraphXExecutionProvider",
    "tensorrt": "TensorrtExecutionProvider",
}
def example_prompts():
    """Return the prompts used for the timed benchmark runs.

    Returns:
        A ``(prompts, negative_prompt)`` tuple: a list of ten text prompts and
        the single negative prompt string that is paired with each of them.
    """
    shared_negative = "bad composition, ugly, abnormal, malformed"
    return [
        "a photo of an astronaut riding a horse on mars",
        "cute grey cat with blue eyes, wearing a bowtie, acrylic painting",
        "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting",
        "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery",
        "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product",
        "background texture of stones, masterpiece, artistic, stunning photo, award winner photo",
        "new international organic style house, tropical surroundings, architecture, 8k, hdr",
        "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation",
        "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic",
        "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k",
    ], shared_negative
def warmup_prompts():
    """Return the ``(prompt, negative_prompt)`` pair used for warm-up runs."""
    warmup_prompt = "warm up"
    warmup_negative = "bad"
    return warmup_prompt, warmup_negative
def measure_gpu_memory(monitor_type, func, start_memory=None):
    """Call ``measure_memory`` in GPU mode and return its result.

    Args:
        monitor_type: Forwarded as ``monitor_type`` (callers in this file pass "cuda" or "rocm").
        func: Callable forwarded as ``func``; may be None.
        start_memory: Forwarded as ``start_memory``.
    """
    forwarded = {"func": func, "monitor_type": monitor_type, "start_memory": start_memory}
    return measure_memory(is_gpu=True, **forwarded)
def get_ort_pipeline(model_name: str, directory: str, provider, disable_safety_checker: bool):
    """Create a diffusers ``OnnxStableDiffusionPipeline`` that uses a DDIM scheduler.

    Args:
        model_name: Model name to load (with ``revision="onnx"``) when ``directory`` is None.
        directory: Directory of a saved ONNX pipeline, or None to load ``model_name`` instead.
        provider: Execution provider forwarded to ``from_pretrained``; either a provider name
            or a ``(name, options)`` tuple.
        disable_safety_checker: When True, the pipeline's safety checker and feature extractor
            are set to None.

    Returns:
        The pipeline, with its progress bar disabled.

    Raises:
        FileNotFoundError: If ``directory`` is given but does not exist.
    """
    # Validate up front with a real exception: an assert is stripped under ``python -O``,
    # and failing before the heavy imports gives the user a fast, clear error.
    if directory is not None and not os.path.exists(directory):
        raise FileNotFoundError(f"ONNX pipeline directory does not exist: {directory}")

    from diffusers import DDIMScheduler, OnnxStableDiffusionPipeline

    import onnxruntime

    if directory is not None:
        session_options = onnxruntime.SessionOptions()
        pipe = OnnxStableDiffusionPipeline.from_pretrained(
            directory,
            provider=provider,
            sess_options=session_options,
        )
    else:
        pipe = OnnxStableDiffusionPipeline.from_pretrained(
            model_name,
            revision="onnx",
            provider=provider,
            use_auth_token=True,
        )

    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
    pipe.set_progress_bar_config(disable=True)

    if disable_safety_checker:
        pipe.safety_checker = None
        pipe.feature_extractor = None

    return pipe
def get_torch_pipeline(model_name: str, disable_safety_checker: bool, enable_torch_compile: bool, use_xformers: bool):
    """Create a float16 diffusers ``StableDiffusionPipeline`` on the "cuda" device with a DDIM scheduler.

    Args:
        model_name: Model name passed to ``StableDiffusionPipeline.from_pretrained``.
        disable_safety_checker: When True, the safety checker and feature extractor are set to None.
        enable_torch_compile: When True, unet, vae and text_encoder are wrapped with ``torch.compile``.
        use_xformers: When True, xformers memory efficient attention is enabled on the pipeline.

    Returns:
        The pipeline, with its progress bar disabled.
    """
    from diffusers import DDIMScheduler, StableDiffusionPipeline
    from torch import channels_last, float16

    pipeline = StableDiffusionPipeline.from_pretrained(model_name, torch_dtype=float16)
    pipeline = pipeline.to("cuda")

    # in-place operation
    pipeline.unet.to(memory_format=channels_last)

    if use_xformers:
        pipeline.enable_xformers_memory_efficient_attention()

    if enable_torch_compile:
        for component in ("unet", "vae", "text_encoder"):
            setattr(pipeline, component, torch.compile(getattr(pipeline, component)))
        print("Torch compiled unet, vae and text_encoder")

    pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
    pipeline.set_progress_bar_config(disable=True)

    if disable_safety_checker:
        pipeline.safety_checker = None
        pipeline.feature_extractor = None

    return pipeline
def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, disable_safety_checker: bool):
    """Build the filename prefix used for images saved by a benchmark run.

    The prefix is ``{engine}_{short model name}_b{batch_size}``, where the short model name is the
    part of ``model_name`` after the last "/" with "stable-diffusion-" replaced by "sd". The suffix
    "_safe" is appended when the safety checker is not disabled.
    """
    last_segment = model_name.rsplit("/", 1)[-1]
    short_name = last_segment.replace("stable-diffusion-", "sd")
    suffix = "" if disable_safety_checker else "_safe"
    return f"{engine}_{short_name}_b{batch_size}{suffix}"
def run_ort_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
):
    """Benchmark a diffusers ``OnnxStableDiffusionPipeline``.

    Three warm-up runs are made (the first two wrapped in ``measure_gpu_memory``), then at most
    ``num_prompts`` example prompts are timed one by one. Each generated image is saved as
    ``{image_filename_prefix}_{prompt index}_{image index}.jpg`` after the timer has stopped.

    Args:
        pipe: The pipeline to benchmark; must be an ``OnnxStableDiffusionPipeline``.
        batch_size: Number of copies of each prompt submitted per pipeline call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Value passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to time.
        batch_count: Only copied into the returned result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.

    Returns:
        A dict with the run settings, average/median latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    from diffusers import OnnxStableDiffusionPipeline

    assert isinstance(pipe, OnnxStableDiffusionPipeline)

    def generate(text, negative_text):
        # The same prompt is repeated batch_size times to form one batch.
        return pipe(
            prompt=[text] * batch_size,
            height=height,
            width=width,
            num_inference_steps=steps,
            negative_prompt=[negative_text] * batch_size,
        )

    def warmup():
        generate(*warmup_prompts())

    all_prompts, negative_prompt = example_prompts()

    # Memory is measured over two warm-up runs: the first run includes the
    # cuDNN/MIOpen algo search, so it might need more memory than the second.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    latencies = []
    for prompt_index, text in zip(range(num_prompts), all_prompts):
        started = time.time()
        images = generate(text, negative_prompt).images
        latency = time.time() - started

        latencies.append(latency)
        print(f"Inference took {latency:.3f} seconds")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.jpg")

    from onnxruntime import __version__ as ort_version

    report = {
        "engine": "onnxruntime",
        "version": ort_version,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
    }
    report["average_latency"] = sum(latencies) / len(latencies)
    report["median_latency"] = statistics.median(latencies)
    report["first_run_memory_MB"] = first_run_memory
    report["second_run_memory_MB"] = second_run_memory
    return report
def run_torch_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
):
    """Benchmark a PyTorch stable diffusion pipeline.

    Three warm-up runs are made (the first two wrapped in ``measure_gpu_memory``), gradients are
    disabled, and then at most ``num_prompts`` example prompts are timed one by one. The timer is
    bracketed by ``torch.cuda.synchronize()`` calls. Each generated image is saved as
    ``{image_filename_prefix}_{prompt index}_{image index}.jpg`` after the timer has stopped.

    Args:
        pipe: Callable pipeline whose result exposes an ``images`` attribute.
        batch_size: Number of copies of each prompt submitted per pipeline call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Value passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to time.
        batch_count: Only copied into the returned result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.

    Returns:
        A dict with the run settings, average/median latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    shared_kwargs = {"height": height, "width": width, "num_inference_steps": steps}

    def warmup():
        warm_prompt, warm_negative = warmup_prompts()
        pipe(
            prompt=[warm_prompt] * batch_size,
            negative_prompt=[warm_negative] * batch_size,
            **shared_kwargs,
        )

    all_prompts, negative_prompt = example_prompts()

    # Memory is measured over two warm-up runs: the first run has the cuDNN algo
    # search, so it might need more memory than the second.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    torch.set_grad_enabled(False)

    latencies = []
    for prompt_index, text in zip(range(num_prompts), all_prompts):
        torch.cuda.synchronize()
        started = time.time()
        output = pipe(
            prompt=[text] * batch_size,
            negative_prompt=[negative_prompt] * batch_size,
            generator=None,  # torch.Generator
            **shared_kwargs,
        )
        images = output.images
        torch.cuda.synchronize()
        latency = time.time() - started

        latencies.append(latency)
        print(f"Inference took {latency:.3f} seconds")

        for image_index, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{prompt_index}_{image_index}.jpg")

    report = {
        "engine": "torch",
        "version": torch.__version__,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
    }
    report["average_latency"] = sum(latencies) / len(latencies)
    report["median_latency"] = statistics.median(latencies)
    report["first_run_memory_MB"] = first_run_memory
    report["second_run_memory_MB"] = second_run_memory
    return report
def run_ort(
    model_name: str,
    directory: str,
    provider: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    tuning: bool,
):
    """Load the diffusers ONNX pipeline, benchmark it, and return the annotated result.

    Args:
        model_name: Model name; also used to derive the image filename prefix.
        directory: Directory of a saved ONNX pipeline, or None.
        provider: Execution provider name, e.g. "CUDAExecutionProvider".
        batch_size: Number of images per pipeline call.
        disable_safety_checker: Whether the safety checker is removed from the pipeline.
        height: Image height.
        width: Image width.
        steps: Number of inference steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Copied into the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        tuning: When True and the provider is CUDA or ROCm, TunableOp and its tuning are enabled
            through provider options.

    Returns:
        The dict from ``run_ort_pipeline`` extended with model, directory, provider and flags.
    """
    tunable_providers = ("CUDAExecutionProvider", "ROCMExecutionProvider")
    if tuning and provider in tunable_providers:
        provider_and_options = (provider, {"tunable_op_enable": 1, "tunable_op_tuning_enable": 1})
    else:
        provider_and_options = provider

    load_start = time.time()
    pipe = get_ort_pipeline(model_name, directory, provider_and_options, disable_safety_checker)
    load_end = time.time()
    print(f"Model loading took {load_end - load_start} seconds")

    result = run_ort_pipeline(
        pipe,
        batch_size,
        get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker),
        height,
        width,
        steps,
        num_prompts,
        batch_count,
        start_memory,
        memory_monitor_type,
    )

    result.update(
        model_name=model_name,
        directory=directory,
        provider=provider.replace("ExecutionProvider", ""),
        disable_safety_checker=disable_safety_checker,
        enable_cuda_graph=False,
    )
    return result
def get_optimum_ort_pipeline(
    model_name: str,
    directory: str,
    provider="CUDAExecutionProvider",
    disable_safety_checker: bool = True,
    use_io_binding: bool = False,
):
    """Create an Optimum ONNX Runtime stable diffusion pipeline.

    The XL pipeline class is chosen when "xl" appears in ``model_name``. If ``directory`` exists,
    the pipeline is loaded from it; otherwise ``model_name`` is exported to ONNX and, when a
    directory was given, the exported pipeline is saved there.

    Args:
        model_name: Model name used to pick the pipeline class and as the export source.
        directory: Directory of a saved pipeline, or the location to save an exported one. May be None.
        provider: Execution provider name forwarded to ``from_pretrained``.
        disable_safety_checker: When True, the safety checker and feature extractor are set to None.
        use_io_binding: Forwarded to the non-XL pipeline. The XL pipeline always gets False.

    Returns:
        The loaded (or freshly exported) pipeline.
    """
    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline

    if "xl" in model_name:
        pipeline_class = ORTStableDiffusionXLPipeline
        load_kwargs = {
            "provider": provider,
            "session_options": None,
            # Not supported by Optimum version 1.17.1 at the time of verification.
            "use_io_binding": False,
        }
    else:
        pipeline_class = ORTStableDiffusionPipeline
        load_kwargs = {"provider": provider, "use_io_binding": use_io_binding}

    if directory is not None and os.path.exists(directory):
        pipeline = pipeline_class.from_pretrained(directory, **load_kwargs)
    else:
        pipeline = pipeline_class.from_pretrained(model_name, export=True, **load_kwargs)
        # Without a target directory there is nowhere to save the export;
        # calling save_pretrained(None) would fail after the costly export.
        if directory is not None:
            pipeline.save_pretrained(directory)

    if disable_safety_checker:
        pipeline.safety_checker = None
        pipeline.feature_extractor = None

    return pipeline
def run_optimum_ort_pipeline(
    pipe,
    batch_size: int,
    image_filename_prefix: str,
    height,
    width,
    steps,
    num_prompts,
    batch_count,
    start_memory,
    memory_monitor_type,
    use_num_images_per_prompt=False,
):
    """Benchmark an Optimum ONNX Runtime stable diffusion pipeline.

    Three warm-up runs are made (the first two wrapped in ``measure_gpu_memory``), then at most
    ``num_prompts`` example prompts are timed one by one. Each generated image is saved as
    ``{image_filename_prefix}_{prompt index}_{image index}.jpg`` after the timer has stopped.

    Args:
        pipe: An ``ORTStableDiffusionPipeline`` or ``ORTStableDiffusionXLPipeline``.
        batch_size: Number of images generated per pipeline call.
        image_filename_prefix: Prefix of the saved image files.
        height: Image height passed to the pipeline.
        width: Image width passed to the pipeline.
        steps: Value passed as ``num_inference_steps``.
        num_prompts: Maximum number of example prompts to time.
        batch_count: Only copied into the returned result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        use_num_images_per_prompt: When True, a single prompt is submitted with
            ``num_images_per_prompt=batch_size`` instead of a list of ``batch_size`` prompts.

    Returns:
        A dict with the run settings, average/median latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline

    assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))

    def generate(text, negative_text):
        # Warm-up and timed runs share this call so both always use the same batch:
        # warming up with a different number of images (previously batch_count)
        # does not prepare the shapes that are measured afterwards.
        if use_num_images_per_prompt:
            return pipe(
                prompt=text,
                height=height,
                width=width,
                num_inference_steps=steps,
                negative_prompt=negative_text,
                num_images_per_prompt=batch_size,
            )
        return pipe(
            prompt=[text] * batch_size,
            height=height,
            width=width,
            num_inference_steps=steps,
            negative_prompt=[negative_text] * batch_size,
        )

    def warmup():
        generate(*warmup_prompts())

    prompts, negative_prompt = example_prompts()

    # Run warm up, and measure GPU memory of two runs.
    # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    latency_list = []
    for i, prompt in zip(range(num_prompts), prompts):
        inference_start = time.time()
        images = generate(prompt, negative_prompt).images
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"Inference took {latency:.3f} seconds")

        for k, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    from onnxruntime import __version__ as ort_version

    return {
        "engine": "optimum_ort",
        "version": ort_version,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
    }
def run_optimum_ort(
    model_name: str,
    directory: str,
    provider: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    use_io_binding: bool = False,
):
    """Load the Optimum ONNX Runtime pipeline, benchmark it, and return the annotated result.

    Args:
        model_name: Model name; also used to derive the image filename prefix.
        directory: Directory of the saved (or to-be-exported) ONNX pipeline.
        provider: Execution provider name, e.g. "CUDAExecutionProvider".
        batch_size: Number of images per pipeline call.
        disable_safety_checker: Whether the safety checker is removed from the pipeline.
        height: Image height.
        width: Image width.
        steps: Number of inference steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Copied into the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.
        use_io_binding: Forwarded to ``get_optimum_ort_pipeline``.

    Returns:
        The dict from ``run_optimum_ort_pipeline`` extended with model, directory, provider and flags.
    """
    load_start = time.time()
    pipe = get_optimum_ort_pipeline(
        model_name,
        directory,
        provider,
        disable_safety_checker,
        use_io_binding=use_io_binding,
    )
    load_end = time.time()
    print(f"Model loading took {load_end - load_start} seconds")

    result = run_optimum_ort_pipeline(
        pipe,
        batch_size,
        get_image_filename_prefix("optimum", model_name, batch_size, disable_safety_checker),
        height,
        width,
        steps,
        num_prompts,
        batch_count,
        start_memory,
        memory_monitor_type,
    )

    result.update(
        model_name=model_name,
        directory=directory,
        provider=provider.replace("ExecutionProvider", ""),
        disable_safety_checker=disable_safety_checker,
        enable_cuda_graph=False,
    )
    return result
def run_ort_trt_static(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph: bool = True,
):
    """Benchmark the stable diffusion pipeline on the ORT TensorRT engine type with static shapes.

    Engines are built for a fixed batch size and image shape, resources are loaded once, three
    warm-up runs are made (the first two wrapped in ``measure_gpu_memory``) and then at most
    ``num_prompts`` example prompts are timed. Images are saved as
    ``{prefix}_{prompt index}_{image index}.jpg`` after the timer has stopped.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Stable diffusion version used to create the ``PipelineInfo``.
        batch_size: Batch size of each run; must not exceed ``max_batch_size``.
        disable_safety_checker: Used for the image filename prefix and copied into the result.
        height: Image height.
        width: Image width.
        steps: Number of denoising steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Only copied into the result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        max_batch_size: Maximum batch size given to the pipeline.
        nvtx_profile: Forwarded to the pipeline.
        use_cuda_graph: Forwarded to the pipeline and copied into the result.

    Returns:
        A dict with the run settings, average/median end-to-end latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    print("[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    # TensorRT plugins have to be registered first.
    from trt_utilities import init_trt_plugins

    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo

    info = PipelineInfo(version)
    short_name = info.short_name()

    from engine_builder import EngineType, get_engine_paths
    from pipeline_stable_diffusion import StableDiffusionPipeline

    ort_trt = EngineType.ORT_TRT
    onnx_dir, engine_dir, output_dir, framework_model_dir, _ = get_engine_paths(work_dir, info, ort_trt)

    pipeline = StableDiffusionPipeline(
        info,
        scheduler="DDIM",
        output_dir=output_dir,
        verbose=False,
        nvtx_profile=nvtx_profile,
        max_batch_size=max_batch_size,
        use_cuda_graph=use_cuda_graph,
        framework_model_dir=framework_model_dir,
        engine_type=ort_trt,
    )

    # Build the engines for exactly this batch size and image shape.
    pipeline.backend.build_engines(
        engine_dir,
        framework_model_dir,
        onnx_dir,
        17,
        opt_image_height=height,
        opt_image_width=width,
        opt_batch_size=batch_size,
        static_batch=True,
        static_image_shape=True,
        max_workspace_size=0,
        device_id=torch.cuda.current_device(),
    )

    # With static batch and image size the resources only need to be allocated once.
    # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
    pipeline.load_resources(height, width, batch_size)

    def warmup():
        warm_prompt, warm_negative = warmup_prompts()
        pipeline.run(
            [warm_prompt] * batch_size,
            [warm_negative] * batch_size,
            height,
            width,
            denoising_steps=steps,
        )

    # Memory is measured over two warm-up runs: the first run has algo search,
    # so it might need more memory than the second.
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    prefix = get_image_filename_prefix("ort_trt", short_name, batch_size, disable_safety_checker)

    all_prompts, negative_prompt = example_prompts()
    latencies = []
    for prompt_index, text in zip(range(num_prompts), all_prompts):
        started = time.time()
        images, pipeline_time = pipeline.run(
            [text] * batch_size,
            [negative_prompt] * batch_size,
            height,
            width,
            denoising_steps=steps,
            guidance=7.5,
            seed=123,
        )
        latency = time.time() - started

        latencies.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")

        for image_index, image in enumerate(images):
            image.save(f"{prefix}_{prompt_index}_{image_index}.jpg")

    pipeline.teardown()

    from tensorrt import __version__ as trt_version

    from onnxruntime import __version__ as ort_version

    report = {
        "model_name": info.name(),
        "engine": "onnxruntime",
        "version": ort_version,
        "provider": f"tensorrt({trt_version})",
        "directory": engine_dir,
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
    }
    report["average_latency"] = sum(latencies) / len(latencies)
    report["median_latency"] = statistics.median(latencies)
    report["first_run_memory_MB"] = first_run_memory
    report["second_run_memory_MB"] = second_run_memory
    report["disable_safety_checker"] = disable_safety_checker
    report["enable_cuda_graph"] = use_cuda_graph
    return report
def run_tensorrt_static(
    work_dir: str,
    version: str,
    model_name: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph: bool = True,
):
    """Benchmark the stable diffusion pipeline on the TensorRT engine type with static shapes.

    Engines are loaded for a fixed batch size and image shape and activated on one shared device
    buffer, three warm-up runs are made (the first two wrapped in ``measure_gpu_memory``) and then
    at most ``num_prompts`` example prompts are timed. Images are saved as
    ``{prefix}_{prompt index}_{image index}.jpg`` after the timer has stopped.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Stable diffusion version used to create the ``PipelineInfo``.
        model_name: Model name used for the image filename prefix.
        batch_size: Batch size of each run; must not exceed ``max_batch_size``.
        disable_safety_checker: Only used for the image filename prefix.
        height: Image height.
        width: Image width.
        steps: Number of denoising steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Only copied into the result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        max_batch_size: Maximum batch size given to the pipeline.
        nvtx_profile: Forwarded to the pipeline.
        use_cuda_graph: Forwarded to the pipeline and copied into the result.

    Returns:
        A dict with the run settings, average/median end-to-end latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    from cuda import cudart

    # Register TensorRT plugins
    from trt_utilities import init_trt_plugins

    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo

    pipeline_info = PipelineInfo(version)

    from engine_builder import EngineType, get_engine_paths
    from pipeline_stable_diffusion import StableDiffusionPipeline

    engine_type = EngineType.TRT
    onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
        work_dir, pipeline_info, engine_type
    )

    # Initialize pipeline
    pipeline = StableDiffusionPipeline(
        pipeline_info,
        scheduler="DDIM",
        output_dir=output_dir,
        verbose=False,
        nvtx_profile=nvtx_profile,
        max_batch_size=max_batch_size,
        # Honor the caller's setting (it used to be hard-coded to True) so the pipeline
        # matches the "enable_cuda_graph" value reported in the result.
        use_cuda_graph=use_cuda_graph,
        engine_type=engine_type,
    )

    # Load TensorRT engines and pytorch modules
    pipeline.backend.load_engines(
        engine_dir=engine_dir,
        framework_model_dir=framework_model_dir,
        onnx_dir=onnx_dir,
        onnx_opset=17,
        opt_batch_size=batch_size,
        opt_image_height=height,
        opt_image_width=width,
        static_batch=True,
        static_shape=True,
        enable_all_tactics=False,
        timing_cache=timing_cache,
    )

    # Activate engines on one shared device buffer. There is a single backend here,
    # so its reported size is used directly.
    max_device_memory = pipeline.backend.max_device_memory()
    _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
    pipeline.backend.activate_engines(shared_device_memory)

    # Here we use static batch and image size, so the resource allocation only need done once.
    # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
    pipeline.load_resources(height, width, batch_size)

    def warmup():
        prompt, negative = warmup_prompts()
        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for i, prompt in zip(range(num_prompts), prompts):
        inference_start = time.time()
        images, pipeline_time = pipeline.run(
            [prompt] * batch_size,
            [negative_prompt] * batch_size,
            height,
            width,
            denoising_steps=steps,
            seed=123,
        )
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")

        for k, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    pipeline.teardown()

    import tensorrt as trt

    return {
        "engine": "tensorrt",
        "version": trt.__version__,
        "provider": "default",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_tensorrt_static_xl(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph=True,
):
    """Benchmark the XL stable diffusion pipeline on the TensorRT engine type with static shapes.

    Engines are loaded for a fixed batch size and image shape and activated on one shared device
    buffer, three warm-up runs are made (the first two wrapped in ``measure_gpu_memory``) and then
    at most ``num_prompts`` example prompts are timed with guidance 5.0. Images are saved as
    ``{prefix}_{prompt index}_{image index}.png`` after the timer has stopped.

    Args:
        work_dir: Root directory from which the engine paths are derived.
        version: Stable diffusion version used to create the ``PipelineInfo``.
        batch_size: Batch size of each run; must not exceed ``max_batch_size``.
        disable_safety_checker: Only used for the image filename prefix.
        height: Image height; must be divisible by 8.
        width: Image width; must be divisible by 8.
        steps: Number of denoising steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Only copied into the result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        max_batch_size: Maximum batch size given to the pipeline.
        nvtx_profile: Forwarded to the pipeline.
        use_cuda_graph: Forwarded to the pipeline and copied into the result.

    Returns:
        A dict with the run settings, average/median end-to-end latency in seconds and the memory
        measurements of the first two warm-up runs.

    Raises:
        ValueError: If ``height`` or ``width`` is not divisible by 8.
    """
    print("[I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)")

    import tensorrt as trt
    from cuda import cudart
    from trt_utilities import init_trt_plugins

    # Validate image dimensions
    image_height = height
    image_width = width
    if image_height % 8 != 0 or image_width % 8 != 0:
        raise ValueError(
            f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}."
        )

    # Register TensorRT plugins
    init_trt_plugins()

    assert batch_size <= max_batch_size

    from diffusion_models import PipelineInfo
    from engine_builder import EngineType, get_engine_paths

    def init_pipeline(pipeline_class, pipeline_info):
        """Create the pipeline and load its TensorRT engines for the fixed batch and image size."""
        engine_type = EngineType.TRT

        onnx_dir, engine_dir, output_dir, framework_model_dir, timing_cache = get_engine_paths(
            work_dir, pipeline_info, engine_type
        )

        # Initialize pipeline
        pipeline = pipeline_class(
            pipeline_info,
            scheduler="DDIM",
            output_dir=output_dir,
            verbose=False,
            nvtx_profile=nvtx_profile,
            max_batch_size=max_batch_size,
            use_cuda_graph=use_cuda_graph,
            framework_model_dir=framework_model_dir,
            engine_type=engine_type,
        )

        pipeline.backend.load_engines(
            engine_dir=engine_dir,
            framework_model_dir=framework_model_dir,
            onnx_dir=onnx_dir,
            onnx_opset=17,
            opt_batch_size=batch_size,
            opt_image_height=height,
            opt_image_width=width,
            static_batch=True,
            static_shape=True,
            enable_all_tactics=False,
            timing_cache=timing_cache,
        )
        return pipeline

    from pipeline_stable_diffusion import StableDiffusionPipeline

    pipeline_info = PipelineInfo(version)
    pipeline = init_pipeline(StableDiffusionPipeline, pipeline_info)

    # There is a single backend here, so its reported size is used directly
    # (the previous max() compared the same value with itself).
    max_device_memory = pipeline.backend.max_device_memory()
    _, shared_device_memory = cudart.cudaMalloc(max_device_memory)
    pipeline.backend.activate_engines(shared_device_memory)

    # Here we use static batch and image size, so the resource allocation only need done once.
    # For dynamic batch and image size, some cost (like memory allocation) shall be included in latency.
    pipeline.load_resources(image_height, image_width, batch_size)

    def run_sd_xl_inference(prompt, negative_prompt, seed=None):
        return pipeline.run(
            prompt,
            negative_prompt,
            image_height,
            image_width,
            denoising_steps=steps,
            guidance=5.0,
            seed=seed,
        )

    def warmup():
        prompt, negative = warmup_prompts()
        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    model_name = pipeline_info.name()
    image_filename_prefix = get_image_filename_prefix("trt", model_name, batch_size, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for i, prompt in zip(range(num_prompts), prompts):
        inference_start = time.time()
        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")

        for k, image in enumerate(images):
            image.save(f"{image_filename_prefix}_{i}_{k}.png")

    pipeline.teardown()

    return {
        "model_name": model_name,
        "engine": "tensorrt",
        "version": trt.__version__,
        "provider": "default",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_ort_trt_xl(
    work_dir: str,
    version: str,
    batch_size: int,
    disable_safety_checker: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
    max_batch_size: int,
    nvtx_profile: bool = False,
    use_cuda_graph=True,
):
    """Benchmark the XL stable diffusion pipeline on the ORT TensorRT engine type.

    The pipeline is created by ``initialize_pipeline``, three warm-up runs are made (the first two
    wrapped in ``measure_gpu_memory``) and then at most ``num_prompts`` example prompts are timed
    with guidance 5.0. Images are saved as ``{prefix}_{prompt index}_{image index}.png`` after the
    timer has stopped.

    Args:
        work_dir: Working directory forwarded to ``initialize_pipeline``.
        version: Stable diffusion version forwarded to ``initialize_pipeline``.
        batch_size: Batch size of each run; must not exceed ``max_batch_size``.
        disable_safety_checker: Only used for the image filename prefix.
        height: Image height.
        width: Image width.
        steps: Number of denoising steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Only copied into the result.
        start_memory: Forwarded to ``measure_gpu_memory``.
        memory_monitor_type: Forwarded to ``measure_gpu_memory``.
        max_batch_size: Maximum batch size forwarded to ``initialize_pipeline``.
        nvtx_profile: Accepted for signature parity with the other runners; not used here.
        use_cuda_graph: Forwarded to ``initialize_pipeline`` and copied into the result.

    Returns:
        A dict with the run settings, average/median end-to-end latency in seconds and the memory
        measurements of the first two warm-up runs.
    """
    # Check the batch size before the pipeline is initialized so that an invalid
    # configuration fails immediately instead of after initialization.
    assert batch_size <= max_batch_size

    from demo_utils import initialize_pipeline
    from engine_builder import EngineType

    pipeline = initialize_pipeline(
        version=version,
        engine_type=EngineType.ORT_TRT,
        work_dir=work_dir,
        height=height,
        width=width,
        use_cuda_graph=use_cuda_graph,
        max_batch_size=max_batch_size,
        opt_batch_size=batch_size,
    )

    pipeline.load_resources(height, width, batch_size)

    def run_sd_xl_inference(prompt, negative_prompt, seed=None):
        return pipeline.run(
            prompt,
            negative_prompt,
            height,
            width,
            denoising_steps=steps,
            guidance=5.0,
            seed=seed,
        )

    def warmup():
        prompt, negative = warmup_prompts()
        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
    second_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)

    warmup()

    model_name = pipeline.pipeline_info.name()
    image_filename_prefix = get_image_filename_prefix("ort_trt", model_name, batch_size, disable_safety_checker)

    latency_list = []
    prompts, negative_prompt = example_prompts()
    for i, prompt in zip(range(num_prompts), prompts):
        inference_start = time.time()
        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
        inference_end = time.time()

        latency = inference_end - inference_start
        latency_list.append(latency)
        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")

        for k, image in enumerate(images):
            filename = f"{image_filename_prefix}_{i}_{k}.png"
            image.save(filename)
            print("Image saved to", filename)

    pipeline.teardown()

    from tensorrt import __version__ as trt_version

    from onnxruntime import __version__ as ort_version

    return {
        "model_name": model_name,
        "engine": "onnxruntime",
        "version": ort_version,
        # Same "tensorrt(<version>)" label as run_ort_trt_static; the opening
        # parenthesis was missing here.
        "provider": f"tensorrt({trt_version})",
        "height": height,
        "width": width,
        "steps": steps,
        "batch_size": batch_size,
        "batch_count": batch_count,
        "num_prompts": num_prompts,
        "average_latency": sum(latency_list) / len(latency_list),
        "median_latency": statistics.median(latency_list),
        "first_run_memory_MB": first_run_memory,
        "second_run_memory_MB": second_run_memory,
        "enable_cuda_graph": use_cuda_graph,
    }
def run_torch(
    model_name: str,
    batch_size: int,
    disable_safety_checker: bool,
    enable_torch_compile: bool,
    use_xformers: bool,
    height: int,
    width: int,
    steps: int,
    num_prompts: int,
    batch_count: int,
    start_memory,
    memory_monitor_type,
):
    """Load the PyTorch pipeline, benchmark it, and return the annotated result.

    cuDNN and its benchmark mode are enabled and gradients are disabled before loading. The
    benchmark runs under ``torch.inference_mode()`` unless ``enable_torch_compile`` is set.

    Args:
        model_name: Model name to load; also used to derive the image filename prefix.
        batch_size: Number of images per pipeline call.
        disable_safety_checker: Whether the safety checker is removed from the pipeline.
        enable_torch_compile: Whether the pipeline modules are wrapped with ``torch.compile``.
        use_xformers: Whether xformers memory efficient attention is enabled.
        height: Image height.
        width: Image width.
        steps: Number of inference steps.
        num_prompts: Maximum number of prompts to time.
        batch_count: Copied into the result.
        start_memory: Forwarded to the memory measurement.
        memory_monitor_type: Forwarded to the memory measurement.

    Returns:
        The dict from ``run_torch_pipeline`` extended with model name and flags. The "provider"
        entry is "compile", "xformers" or "default".
    """
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.set_grad_enabled(False)

    load_start = time.time()
    pipe = get_torch_pipeline(model_name, disable_safety_checker, enable_torch_compile, use_xformers)
    load_end = time.time()
    print(f"Model loading took {load_end - load_start} seconds")

    pipeline_args = (
        pipe,
        batch_size,
        get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker),
        height,
        width,
        steps,
        num_prompts,
        batch_count,
        start_memory,
        memory_monitor_type,
    )

    if enable_torch_compile:
        result = run_torch_pipeline(*pipeline_args)
    else:
        with torch.inference_mode():
            result = run_torch_pipeline(*pipeline_args)

    if enable_torch_compile:
        provider_label = "compile"
    elif use_xformers:
        provider_label = "xformers"
    else:
        provider_label = "default"

    result.update(
        model_name=model_name,
        directory=None,
        provider=provider_label,
        disable_safety_checker=disable_safety_checker,
        enable_cuda_graph=False,
    )
    return result
def parse_arguments():
    """Parse the benchmark's command line options.

    Returns:
        The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add(
        "-e",
        "--engine",
        type=str,
        default="onnxruntime",
        choices=["onnxruntime", "optimum", "torch", "tensorrt"],
        help="Engines to benchmark. Default is onnxruntime.",
    )
    add(
        "-r",
        "--provider",
        type=str,
        default="cuda",
        choices=list(PROVIDERS),
        help="Provider to benchmark. Default is CUDAExecutionProvider.",
    )
    add(
        "-t",
        "--tuning",
        action="store_true",
        help="Enable TunableOp and tuning. "
        "This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.",
    )
    add(
        "-v",
        "--version",
        type=str,
        choices=list(SD_MODELS),
        default="1.5",
        help="Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.",
    )
    add(
        "-p",
        "--pipeline",
        type=str,
        default=None,
        help="Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.",
    )
    add(
        "-w",
        "--work_dir",
        type=str,
        default=".",
        help="Root directory to save exported onnx models, built engines etc.",
    )

    # Boolean switches; all of them are off unless given on the command line.
    add("--enable_safety_checker", action="store_true", default=False, help="Enable safety checker")
    add(
        "--enable_torch_compile",
        action="store_true",
        default=False,
        help="Enable compile unet for PyTorch 2.0",
    )
    add("--use_xformers", action="store_true", default=False, help="Use xformers for PyTorch")
    add("--use_io_binding", action="store_true", default=False, help="Use I/O Binding for Optimum.")

    add(
        "-b",
        "--batch_size",
        type=int,
        default=1,
        choices=[1, 2, 3, 4, 8, 10, 16, 32],
        help="Number of images per batch. Default is 1.",
    )
    add("--height", type=int, default=512, help="Output image height. Default is 512.")
    add("--width", type=int, default=512, help="Output image width. Default is 512.")
    add("-s", "--steps", type=int, default=50, help="Number of steps. Default is 50.")
    add("-n", "--num_prompts", type=int, default=10, help="Number of prompts. Default is 10.")
    add(
        "-c",
        "--batch_count",
        type=int,
        choices=range(1, 11),
        default=5,
        help="Number of batches to test. Default is 5.",
    )
    add(
        "-m",
        "--max_trt_batch_size",
        type=int,
        choices=range(1, 16),
        default=4,
        help="Maximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.",
    )
    add(
        "-g",
        "--enable_cuda_graph",
        action="store_true",
        default=False,
        help="Enable Cuda Graph. Requires onnxruntime >= 1.16",
    )

    return parser.parse_args()
def print_loaded_libraries(cuda_related_only=True):
    """Print the path of every memory-mapped file of the current process.

    Args:
        cuda_related_only: When true, print only paths that contain
            "libcu", "libnv" or "tensorrt"; otherwise print every path.
    """
    import psutil

    keywords = ("libcu", "libnv", "tensorrt")
    current_process = psutil.Process(os.getpid())
    for mapping in current_process.memory_maps():
        path = mapping.path
        # Skip entries that do not match any keyword when filtering is requested.
        if cuda_related_only and not any(keyword in path for keyword in keywords):
            continue
        print(path)
def main():
    """Run one stable diffusion benchmark chosen by the command line arguments.

    The function parses the arguments, sets the attention related environment
    variables needed by the selected engine, records the GPU memory in use
    before any model is loaded, dispatches to the matching ``run_*`` benchmark
    function, prints the result and appends it as one row to
    ``benchmark_result.csv`` in the current working directory. When
    ``--steps 1`` is used, the loaded libraries are printed for debugging.

    Raises:
        ValueError: If CUDA graph is requested for the onnxruntime engine with
            an unsupported provider/pipeline combination or with ONNX Runtime
            older than 1.16, or if the onnxruntime engine (non-TensorRT
            provider) is selected without a valid ``--pipeline`` directory.
    """
    args = parse_arguments()
    print(args)

    if args.engine == "onnxruntime":
        if args.version in ["2.1"]:
            # Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model.
            # The environment variables shall be set before the first run of Attention or MultiHeadAttention operator.
            os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"

        from packaging import version

        from onnxruntime import __version__ as ort_version

        if version.parse(ort_version) == version.parse("1.16.0"):
            # ORT 1.16 has a bug that might trigger Attention RuntimeError when latest fusion script is applied on clip model.
            # The workaround is to enable fused causal attention, or disable Attention fusion for clip model.
            os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"

        # CUDA graph validation applies to the onnxruntime engine only; the TensorRT
        # engine branches below forward the flag to their pipelines directly.
        if args.enable_cuda_graph:
            if not (args.engine == "onnxruntime" and args.provider in ["cuda", "tensorrt"] and args.pipeline is None):
                raise ValueError("The stable diffusion pipeline does not support CUDA graph.")

            if version.parse(ort_version) < version.parse("1.16"):
                raise ValueError("CUDA graph requires ONNX Runtime 1.16 or later")

    coloredlogs.install(fmt="%(funcName)20s: %(message)s")

    # Baseline GPU memory measured before any model is loaded; it is passed to every runner.
    memory_monitor_type = "rocm" if args.provider == "rocm" else "cuda"
    start_memory = measure_gpu_memory(memory_monitor_type, None)
    print("GPU memory used before loading models:", start_memory)

    sd_model = SD_MODELS[args.version]
    provider = PROVIDERS[args.provider]
    if args.engine == "onnxruntime" and args.provider == "tensorrt":
        if "xl" in args.version:
            print("Testing Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.")
            result = run_ort_trt_xl(
                work_dir=args.work_dir,
                version=args.version,
                batch_size=args.batch_size,
                disable_safety_checker=True,
                height=args.height,
                width=args.width,
                steps=args.steps,
                num_prompts=args.num_prompts,
                batch_count=args.batch_count,
                start_memory=start_memory,
                memory_monitor_type=memory_monitor_type,
                max_batch_size=args.max_trt_batch_size,
                nvtx_profile=False,
                use_cuda_graph=args.enable_cuda_graph,
            )
        else:
            print("Testing Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.")
            result = run_ort_trt_static(
                work_dir=args.work_dir,
                version=args.version,
                batch_size=args.batch_size,
                disable_safety_checker=not args.enable_safety_checker,
                height=args.height,
                width=args.width,
                steps=args.steps,
                num_prompts=args.num_prompts,
                batch_count=args.batch_count,
                start_memory=start_memory,
                memory_monitor_type=memory_monitor_type,
                max_batch_size=args.max_trt_batch_size,
                nvtx_profile=False,
                use_cuda_graph=args.enable_cuda_graph,
            )
    elif args.engine == "optimum" and provider == "CUDAExecutionProvider":
        if "xl" in args.version:
            os.environ["ORT_ENABLE_FUSED_CAUSAL_ATTENTION"] = "1"

        result = run_optimum_ort(
            model_name=sd_model,
            directory=args.pipeline,
            provider=provider,
            batch_size=args.batch_size,
            disable_safety_checker=not args.enable_safety_checker,
            height=args.height,
            width=args.width,
            steps=args.steps,
            num_prompts=args.num_prompts,
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
            use_io_binding=args.use_io_binding,
        )
    elif args.engine == "onnxruntime":
        # Raise explicitly instead of using assert so the check survives `python -O`.
        if not (args.pipeline and os.path.isdir(args.pipeline)):
            raise ValueError("--pipeline should be specified for the directory of ONNX models")
        print(f"Testing diffusers StableDiffusionPipeline with {provider} provider and tuning={args.tuning}")
        result = run_ort(
            model_name=sd_model,
            directory=args.pipeline,
            provider=provider,
            batch_size=args.batch_size,
            disable_safety_checker=not args.enable_safety_checker,
            height=args.height,
            width=args.width,
            steps=args.steps,
            num_prompts=args.num_prompts,
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
            tuning=args.tuning,
        )
    elif args.engine == "tensorrt" and "xl" in args.version:
        print("Testing Txt2ImgXLPipeline with static input shape. Backend is TensorRT.")
        result = run_tensorrt_static_xl(
            work_dir=args.work_dir,
            version=args.version,
            batch_size=args.batch_size,
            disable_safety_checker=True,
            height=args.height,
            width=args.width,
            steps=args.steps,
            num_prompts=args.num_prompts,
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
            max_batch_size=args.max_trt_batch_size,
            nvtx_profile=False,
            use_cuda_graph=args.enable_cuda_graph,
        )
    elif args.engine == "tensorrt":
        print("Testing Txt2ImgPipeline with static input shape. Backend is TensorRT.")
        result = run_tensorrt_static(
            work_dir=args.work_dir,
            version=args.version,
            model_name=sd_model,
            batch_size=args.batch_size,
            disable_safety_checker=True,
            height=args.height,
            width=args.width,
            steps=args.steps,
            num_prompts=args.num_prompts,
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
            max_batch_size=args.max_trt_batch_size,
            nvtx_profile=False,
            use_cuda_graph=args.enable_cuda_graph,
        )
    else:
        # NOTE: every engine/provider combination not matched above (for example the
        # optimum engine with a non-CUDA provider) ends up in the PyTorch benchmark.
        print(
            f"Testing Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile={args.enable_torch_compile}, xformers={args.use_xformers}."
        )
        result = run_torch(
            model_name=sd_model,
            batch_size=args.batch_size,
            disable_safety_checker=not args.enable_safety_checker,
            enable_torch_compile=args.enable_torch_compile,
            use_xformers=args.use_xformers,
            height=args.height,
            width=args.width,
            steps=args.steps,
            num_prompts=args.num_prompts,
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
        )

    print(result)

    csv_path = "benchmark_result.csv"
    # Results accumulate across runs because the file is opened in append mode. Write the
    # header only when the file is new or empty; otherwise every run would insert another
    # header row in the middle of the data.
    need_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, mode="a", newline="") as csv_file:
        column_names = [
            "model_name",
            "directory",
            "engine",
            "version",
            "provider",
            "disable_safety_checker",
            "height",
            "width",
            "steps",
            "batch_size",
            "batch_count",
            "num_prompts",
            "average_latency",
            "median_latency",
            "first_run_memory_MB",
            "second_run_memory_MB",
            "enable_cuda_graph",
        ]
        csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        if need_header:
            csv_writer.writeheader()
        csv_writer.writerow(result)

    # Show loaded DLLs when steps == 1 for debugging purpose.
    if args.steps == 1:
        print_loaded_libraries(args.provider in ["cuda", "tensorrt"])
if __name__ == "__main__":
    import traceback

    # Run the benchmark; on failure print the traceback of the active exception
    # rather than letting it propagate out of the script.
    try:
        main()
    except Exception:
        traceback.print_exc()