mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-04 04:07:22 +00:00
[CUDA] stable diffusion benchmark allows IO binding for optimum (#22834)
### Description Update stable diffusion benchmark: (1) allow IO binding for optimum. (2) do not use num_images_per_prompt across all engines for fair comparison. Example to run benchmark of optimum on stable diffusion 1.5: ``` git clone https://github.com/tianleiwu/optimum cd optimum git checkout tlwu/diffusers-io-binding pip install -e . pip install -U onnxruntime-gpu git clone https://github.com/microsoft/onnxruntime cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion git checkout tlwu/benchmark_sd_optimum_io_binding pip install -r requirements/cuda12/requirements.txt optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 --task text-to-image ./sd_onnx_fp32 python optimize_pipeline.py -i ./sd_onnx_fp32 -o ./sd_onnx_fp16 --float16 python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 --use_io_binding ``` Example output in H100_80GB_HBM3: 572 ms with IO Binding; 588 ms without IO Binding; IO binding gains 16ms, or 2.7%, ### Motivation and Context Optimum is working on enabling I/O binding: https://github.com/huggingface/optimum/pull/2056. This could help testing the impact of I/O binding on the performance of the stable diffusion.
This commit is contained in:
parent
dd99e34d66
commit
09c98433e7
1 changed files with 169 additions and 132 deletions
|
|
@ -51,6 +51,10 @@ def example_prompts():
|
|||
return prompts, negative_prompt
|
||||
|
||||
|
||||
def warmup_prompts():
|
||||
return "warm up", "bad"
|
||||
|
||||
|
||||
def measure_gpu_memory(monitor_type, func, start_memory=None):
|
||||
return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
|
||||
|
||||
|
|
@ -136,7 +140,14 @@ def run_ort_pipeline(
|
|||
prompts, negative_prompt = example_prompts()
|
||||
|
||||
def warmup():
|
||||
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
|
||||
prompt, negative = warmup_prompts()
|
||||
pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative] * batch_size,
|
||||
)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs
|
||||
# cuDNN/MIOpen The first run has algo search so it might need more memory)
|
||||
|
|
@ -149,22 +160,20 @@ def run_ort_pipeline(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
images = pipe(
|
||||
[prompt] * batch_size,
|
||||
height,
|
||||
width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative_prompt] * batch_size,
|
||||
guidance_scale=7.5,
|
||||
).images
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
|
||||
inference_start = time.time()
|
||||
images = pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative_prompt] * batch_size,
|
||||
).images
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
|
||||
|
||||
from onnxruntime import __version__ as ort_version
|
||||
|
||||
|
|
@ -200,7 +209,14 @@ def run_torch_pipeline(
|
|||
|
||||
# total 2 runs of warm up, and measure GPU memory for CUDA EP
|
||||
def warmup():
|
||||
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
|
||||
prompt, negative = warmup_prompts()
|
||||
pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative] * batch_size,
|
||||
)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
|
||||
first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
|
||||
|
|
@ -215,25 +231,23 @@ def run_torch_pipeline(
|
|||
if i >= num_prompts:
|
||||
break
|
||||
torch.cuda.synchronize()
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
images = pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
guidance_scale=7.5,
|
||||
negative_prompt=[negative_prompt] * batch_size,
|
||||
generator=None, # torch.Generator
|
||||
).images
|
||||
inference_start = time.time()
|
||||
images = pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative_prompt] * batch_size,
|
||||
generator=None, # torch.Generator
|
||||
).images
|
||||
|
||||
torch.cuda.synchronize()
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
|
||||
torch.cuda.synchronize()
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
|
||||
|
||||
return {
|
||||
"engine": "torch",
|
||||
|
|
@ -306,6 +320,7 @@ def get_optimum_ort_pipeline(
|
|||
directory: str,
|
||||
provider="CUDAExecutionProvider",
|
||||
disable_safety_checker: bool = True,
|
||||
use_io_binding: bool = False,
|
||||
):
|
||||
from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
|
||||
|
||||
|
|
@ -321,7 +336,7 @@ def get_optimum_ort_pipeline(
|
|||
pipeline = ORTStableDiffusionPipeline.from_pretrained(
|
||||
directory,
|
||||
provider=provider,
|
||||
use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
|
||||
use_io_binding=use_io_binding,
|
||||
)
|
||||
elif "xl" in model_name:
|
||||
pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
|
||||
|
|
@ -337,7 +352,7 @@ def get_optimum_ort_pipeline(
|
|||
model_name,
|
||||
export=True,
|
||||
provider=provider,
|
||||
use_io_binding=False, # Not supported by Optimum version 1.17.1 at the time of verification.
|
||||
use_io_binding=use_io_binding,
|
||||
)
|
||||
pipeline.save_pretrained(directory)
|
||||
|
||||
|
|
@ -359,15 +374,33 @@ def run_optimum_ort_pipeline(
|
|||
batch_count,
|
||||
start_memory,
|
||||
memory_monitor_type,
|
||||
use_num_images_per_prompt=False,
|
||||
):
|
||||
from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
|
||||
|
||||
assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
|
||||
|
||||
prompts = example_prompts()
|
||||
prompts, negative_prompt = example_prompts()
|
||||
|
||||
def warmup():
|
||||
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
|
||||
prompt, negative = warmup_prompts()
|
||||
if use_num_images_per_prompt:
|
||||
pipe(
|
||||
prompt=prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=negative,
|
||||
num_images_per_prompt=batch_count,
|
||||
)
|
||||
else:
|
||||
pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative] * batch_size,
|
||||
)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs.
|
||||
# The first run has algo search for cuDNN/MIOpen, so it might need more memory.
|
||||
|
|
@ -380,23 +413,30 @@ def run_optimum_ort_pipeline(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
inference_start = time.time()
|
||||
if use_num_images_per_prompt:
|
||||
images = pipe(
|
||||
prompt,
|
||||
height,
|
||||
width,
|
||||
prompt=prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=None,
|
||||
guidance_scale=0.0, # 7.5
|
||||
negative_prompt=negative_prompt,
|
||||
num_images_per_prompt=batch_size,
|
||||
).images
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
|
||||
else:
|
||||
images = pipe(
|
||||
prompt=[prompt] * batch_size,
|
||||
height=height,
|
||||
width=width,
|
||||
num_inference_steps=steps,
|
||||
negative_prompt=[negative_prompt] * batch_size,
|
||||
).images
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"Inference took {latency:.3f} seconds")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
|
||||
|
||||
from onnxruntime import __version__ as ort_version
|
||||
|
||||
|
|
@ -429,9 +469,12 @@ def run_optimum_ort(
|
|||
batch_count: int,
|
||||
start_memory,
|
||||
memory_monitor_type,
|
||||
use_io_binding: bool = False,
|
||||
):
|
||||
load_start = time.time()
|
||||
pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
|
||||
pipe = get_optimum_ort_pipeline(
|
||||
model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
|
||||
)
|
||||
load_end = time.time()
|
||||
print(f"Model loading took {load_end - load_start} seconds")
|
||||
|
||||
|
|
@ -530,9 +573,8 @@ def run_ort_trt_static(
|
|||
pipeline.load_resources(height, width, batch_size)
|
||||
|
||||
def warmup():
|
||||
pipeline.run(
|
||||
["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
|
||||
)
|
||||
prompt, negative = warmup_prompts()
|
||||
pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs
|
||||
# The first run has algo search so it might need more memory
|
||||
|
|
@ -548,24 +590,23 @@ def run_ort_trt_static(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = pipeline.run(
|
||||
[prompt] * batch_size,
|
||||
[negative_prompt] * batch_size,
|
||||
height,
|
||||
width,
|
||||
denoising_steps=steps,
|
||||
guidance=7.5,
|
||||
seed=123,
|
||||
)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = pipeline.run(
|
||||
[prompt] * batch_size,
|
||||
[negative_prompt] * batch_size,
|
||||
height,
|
||||
width,
|
||||
denoising_steps=steps,
|
||||
guidance=7.5,
|
||||
seed=123,
|
||||
)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
|
||||
|
||||
pipeline.teardown()
|
||||
|
||||
|
|
@ -671,9 +712,8 @@ def run_tensorrt_static(
|
|||
pipeline.load_resources(height, width, batch_size)
|
||||
|
||||
def warmup():
|
||||
pipeline.run(
|
||||
["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
|
||||
)
|
||||
prompt, negative = warmup_prompts()
|
||||
pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs
|
||||
# The first run has algo search so it might need more memory
|
||||
|
|
@ -689,24 +729,22 @@ def run_tensorrt_static(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = pipeline.run(
|
||||
[prompt] * batch_size,
|
||||
[negative_prompt] * batch_size,
|
||||
height,
|
||||
width,
|
||||
denoising_steps=steps,
|
||||
guidance=7.5,
|
||||
seed=123,
|
||||
)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = pipeline.run(
|
||||
[prompt] * batch_size,
|
||||
[negative_prompt] * batch_size,
|
||||
height,
|
||||
width,
|
||||
denoising_steps=steps,
|
||||
seed=123,
|
||||
)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
|
||||
|
||||
pipeline.teardown()
|
||||
|
||||
|
|
@ -828,7 +866,8 @@ def run_tensorrt_static_xl(
|
|||
)
|
||||
|
||||
def warmup():
|
||||
run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
|
||||
prompt, negative = warmup_prompts()
|
||||
run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs
|
||||
# The first run has algo search so it might need more memory
|
||||
|
|
@ -845,20 +884,15 @@ def run_tensorrt_static_xl(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
if nvtx_profile:
|
||||
cudart.cudaProfilerStart()
|
||||
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
|
||||
if nvtx_profile:
|
||||
cudart.cudaProfilerStop()
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
image.save(f"{image_filename_prefix}_{i}_{k}.png")
|
||||
|
||||
pipeline.teardown()
|
||||
|
||||
|
|
@ -911,8 +945,6 @@ def run_ort_trt_xl(
|
|||
opt_batch_size=batch_size,
|
||||
)
|
||||
|
||||
from cuda import cudart
|
||||
|
||||
assert batch_size <= max_batch_size
|
||||
|
||||
pipeline.load_resources(height, width, batch_size)
|
||||
|
|
@ -929,7 +961,8 @@ def run_ort_trt_xl(
|
|||
)
|
||||
|
||||
def warmup():
|
||||
run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
|
||||
prompt, negative = warmup_prompts()
|
||||
run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs
|
||||
# The first run has algo search so it might need more memory
|
||||
|
|
@ -946,22 +979,17 @@ def run_ort_trt_xl(
|
|||
for i, prompt in enumerate(prompts):
|
||||
if i >= num_prompts:
|
||||
break
|
||||
for j in range(batch_count):
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
if nvtx_profile:
|
||||
cudart.cudaProfilerStart()
|
||||
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
|
||||
if nvtx_profile:
|
||||
cudart.cudaProfilerStop()
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
|
||||
image.save(filename)
|
||||
print("Image saved to", filename)
|
||||
inference_start = time.time()
|
||||
# Use warmup mode here since non-warmup mode will save image to disk.
|
||||
images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
|
||||
inference_end = time.time()
|
||||
latency = inference_end - inference_start
|
||||
latency_list.append(latency)
|
||||
print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
|
||||
for k, image in enumerate(images):
|
||||
filename = f"{image_filename_prefix}_{i}_{k}.png"
|
||||
image.save(filename)
|
||||
print("Image saved to", filename)
|
||||
|
||||
pipeline.teardown()
|
||||
|
||||
|
|
@ -1137,6 +1165,14 @@ def parse_arguments():
|
|||
)
|
||||
parser.set_defaults(use_xformers=False)
|
||||
|
||||
parser.add_argument(
|
||||
"--use_io_binding",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Use I/O Binding for Optimum.",
|
||||
)
|
||||
parser.set_defaults(use_io_binding=False)
|
||||
|
||||
parser.add_argument(
|
||||
"-b",
|
||||
"--batch_size",
|
||||
|
|
@ -1176,8 +1212,8 @@ def parse_arguments():
|
|||
"--num_prompts",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of prompts. Default is 1.",
|
||||
default=10,
|
||||
help="Number of prompts. Default is 10.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
|
|
@ -1312,6 +1348,7 @@ def main():
|
|||
batch_count=args.batch_count,
|
||||
start_memory=start_memory,
|
||||
memory_monitor_type=memory_monitor_type,
|
||||
use_io_binding=args.use_io_binding,
|
||||
)
|
||||
elif args.engine == "onnxruntime":
|
||||
assert args.pipeline and os.path.isdir(
|
||||
|
|
|
|||
Loading…
Reference in a new issue