[CUDA] stable diffusion benchmark allows IO binding for optimum (#22834)

### Description

Update stable diffusion benchmark:
(1) allow IO binding for optimum.
(2) do not use num_images_per_prompt across all engines for fair
comparison.

Example to run benchmark of optimum on stable diffusion 1.5:
```
git clone https://github.com/tianleiwu/optimum
cd optimum
git checkout tlwu/diffusers-io-binding
pip install -e .

pip install -U onnxruntime-gpu
git clone https://github.com/microsoft/onnxruntime
cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion
git checkout tlwu/benchmark_sd_optimum_io_binding
pip install -r requirements/cuda12/requirements.txt

optimum-cli export onnx --model runwayml/stable-diffusion-v1-5  --task text-to-image ./sd_onnx_fp32

python optimize_pipeline.py -i ./sd_onnx_fp32 -o ./sd_onnx_fp16 --float16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 --use_io_binding
```

Example output in H100_80GB_HBM3: 572 ms with IO Binding; 588 ms without
IO Binding; IO binding gains 16ms, or 2.7%,

### Motivation and Context

Optimum is working on enabling I/O binding:
https://github.com/huggingface/optimum/pull/2056. This could help
testing the impact of I/O binding on the performance of the stable
diffusion.
This commit is contained in:
Tianlei Wu 2024-11-14 00:09:07 -08:00 committed by GitHub
parent dd99e34d66
commit 09c98433e7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -51,6 +51,10 @@ def example_prompts():
return prompts, negative_prompt
def warmup_prompts():
return "warm up", "bad"
def measure_gpu_memory(monitor_type, func, start_memory=None):
return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
@ -136,7 +140,14 @@ def run_ort_pipeline(
prompts, negative_prompt = example_prompts()
def warmup():
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
prompt, negative = warmup_prompts()
pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative] * batch_size,
)
# Run warm up, and measure GPU memory of two runs
# cuDNN/MIOpen The first run has algo search so it might need more memory)
@ -149,22 +160,20 @@ def run_ort_pipeline(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
images = pipe(
[prompt] * batch_size,
height,
width,
num_inference_steps=steps,
negative_prompt=[negative_prompt] * batch_size,
guidance_scale=7.5,
).images
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
inference_start = time.time()
images = pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative_prompt] * batch_size,
).images
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
from onnxruntime import __version__ as ort_version
@ -200,7 +209,14 @@ def run_torch_pipeline(
# total 2 runs of warm up, and measure GPU memory for CUDA EP
def warmup():
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
prompt, negative = warmup_prompts()
pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative] * batch_size,
)
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
@ -215,25 +231,23 @@ def run_torch_pipeline(
if i >= num_prompts:
break
torch.cuda.synchronize()
for j in range(batch_count):
inference_start = time.time()
images = pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
guidance_scale=7.5,
negative_prompt=[negative_prompt] * batch_size,
generator=None, # torch.Generator
).images
inference_start = time.time()
images = pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative_prompt] * batch_size,
generator=None, # torch.Generator
).images
torch.cuda.synchronize()
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
torch.cuda.synchronize()
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
return {
"engine": "torch",
@ -306,6 +320,7 @@ def get_optimum_ort_pipeline(
directory: str,
provider="CUDAExecutionProvider",
disable_safety_checker: bool = True,
use_io_binding: bool = False,
):
from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
@ -321,7 +336,7 @@ def get_optimum_ort_pipeline(
pipeline = ORTStableDiffusionPipeline.from_pretrained(
directory,
provider=provider,
use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
use_io_binding=use_io_binding,
)
elif "xl" in model_name:
pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
@ -337,7 +352,7 @@ def get_optimum_ort_pipeline(
model_name,
export=True,
provider=provider,
use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
use_io_binding=use_io_binding,
)
pipeline.save_pretrained(directory)
@ -359,15 +374,33 @@ def run_optimum_ort_pipeline(
batch_count,
start_memory,
memory_monitor_type,
use_num_images_per_prompt=False,
):
from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
prompts = example_prompts()
prompts, negative_prompt = example_prompts()
def warmup():
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
prompt, negative = warmup_prompts()
if use_num_images_per_prompt:
pipe(
prompt=prompt,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=negative,
num_images_per_prompt=batch_count,
)
else:
pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative] * batch_size,
)
# Run warm up, and measure GPU memory of two runs.
# The first run has algo search for cuDNN/MIOpen, so it might need more memory.
@ -380,23 +413,30 @@ def run_optimum_ort_pipeline(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
inference_start = time.time()
if use_num_images_per_prompt:
images = pipe(
prompt,
height,
width,
prompt=prompt,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=None,
guidance_scale=0.0, # 7.5
negative_prompt=negative_prompt,
num_images_per_prompt=batch_size,
).images
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
else:
images = pipe(
prompt=[prompt] * batch_size,
height=height,
width=width,
num_inference_steps=steps,
negative_prompt=[negative_prompt] * batch_size,
).images
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"Inference took {latency:.3f} seconds")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
from onnxruntime import __version__ as ort_version
@ -429,9 +469,12 @@ def run_optimum_ort(
batch_count: int,
start_memory,
memory_monitor_type,
use_io_binding: bool = False,
):
load_start = time.time()
pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
pipe = get_optimum_ort_pipeline(
model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
)
load_end = time.time()
print(f"Model loading took {load_end - load_start} seconds")
@ -530,9 +573,8 @@ def run_ort_trt_static(
pipeline.load_resources(height, width, batch_size)
def warmup():
pipeline.run(
["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
)
prompt, negative = warmup_prompts()
pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
# Run warm up, and measure GPU memory of two runs
# The first run has algo search so it might need more memory
@ -548,24 +590,23 @@ def run_ort_trt_static(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = pipeline.run(
[prompt] * batch_size,
[negative_prompt] * batch_size,
height,
width,
denoising_steps=steps,
guidance=7.5,
seed=123,
)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = pipeline.run(
[prompt] * batch_size,
[negative_prompt] * batch_size,
height,
width,
denoising_steps=steps,
guidance=7.5,
seed=123,
)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
pipeline.teardown()
@ -671,9 +712,8 @@ def run_tensorrt_static(
pipeline.load_resources(height, width, batch_size)
def warmup():
pipeline.run(
["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
)
prompt, negative = warmup_prompts()
pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
# Run warm up, and measure GPU memory of two runs
# The first run has algo search so it might need more memory
@ -689,24 +729,22 @@ def run_tensorrt_static(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = pipeline.run(
[prompt] * batch_size,
[negative_prompt] * batch_size,
height,
width,
denoising_steps=steps,
guidance=7.5,
seed=123,
)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = pipeline.run(
[prompt] * batch_size,
[negative_prompt] * batch_size,
height,
width,
denoising_steps=steps,
seed=123,
)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
pipeline.teardown()
@ -828,7 +866,8 @@ def run_tensorrt_static_xl(
)
def warmup():
run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
prompt, negative = warmup_prompts()
run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
# Run warm up, and measure GPU memory of two runs
# The first run has algo search so it might need more memory
@ -845,20 +884,15 @@ def run_tensorrt_static_xl(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
if nvtx_profile:
cudart.cudaProfilerStart()
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
if nvtx_profile:
cudart.cudaProfilerStop()
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
image.save(f"{image_filename_prefix}_{i}_{k}.png")
pipeline.teardown()
@ -911,8 +945,6 @@ def run_ort_trt_xl(
opt_batch_size=batch_size,
)
from cuda import cudart
assert batch_size <= max_batch_size
pipeline.load_resources(height, width, batch_size)
@ -929,7 +961,8 @@ def run_ort_trt_xl(
)
def warmup():
run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
prompt, negative = warmup_prompts()
run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
# Run warm up, and measure GPU memory of two runs
# The first run has algo search so it might need more memory
@ -946,22 +979,17 @@ def run_ort_trt_xl(
for i, prompt in enumerate(prompts):
if i >= num_prompts:
break
for j in range(batch_count):
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
if nvtx_profile:
cudart.cudaProfilerStart()
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
if nvtx_profile:
cudart.cudaProfilerStop()
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
image.save(filename)
print("Image saved to", filename)
inference_start = time.time()
# Use warmup mode here since non-warmup mode will save image to disk.
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
inference_end = time.time()
latency = inference_end - inference_start
latency_list.append(latency)
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
for k, image in enumerate(images):
filename = f"{image_filename_prefix}_{i}_{k}.png"
image.save(filename)
print("Image saved to", filename)
pipeline.teardown()
@ -1137,6 +1165,14 @@ def parse_arguments():
)
parser.set_defaults(use_xformers=False)
parser.add_argument(
"--use_io_binding",
required=False,
action="store_true",
help="Use I/O Binding for Optimum.",
)
parser.set_defaults(use_io_binding=False)
parser.add_argument(
"-b",
"--batch_size",
@ -1176,8 +1212,8 @@ def parse_arguments():
"--num_prompts",
required=False,
type=int,
default=1,
help="Number of prompts. Default is 1.",
default=10,
help="Number of prompts. Default is 10.",
)
parser.add_argument(
@ -1312,6 +1348,7 @@ def main():
batch_count=args.batch_count,
start_memory=start_memory,
memory_monitor_type=memory_monitor_type,
use_io_binding=args.use_io_binding,
)
elif args.engine == "onnxruntime":
assert args.pipeline and os.path.isdir(