[CUDA] stable diffusion benchmark allows IO binding for optimum (#22834)

### Description

Update the stable diffusion benchmark:

1. Allow I/O binding for Optimum.
2. Do not use `num_images_per_prompt` in any engine, for a fair comparison.

Example of running the Optimum benchmark on Stable Diffusion 1.5:

```
git clone https://github.com/tianleiwu/optimum
cd optimum
git checkout tlwu/diffusers-io-binding
pip install -e .
pip install -U onnxruntime-gpu
git clone https://github.com/microsoft/onnxruntime
cd onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion
git checkout tlwu/benchmark_sd_optimum_io_binding
pip install -r requirements/cuda12/requirements.txt
optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 --task text-to-image ./sd_onnx_fp32
python optimize_pipeline.py -i ./sd_onnx_fp32 -o ./sd_onnx_fp16 --float16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16
python benchmark.py -e optimum -r cuda -v 1.5 -p ./sd_onnx_fp16 --use_io_binding
```

Example output on H100_80GB_HBM3: 572 ms with I/O binding; 588 ms without I/O binding. I/O binding gains 16 ms, or 2.7%.

### Motivation and Context

Optimum is working on enabling I/O binding: https://github.com/huggingface/optimum/pull/2056. This change helps test the impact of I/O binding on stable diffusion performance.
2026-07-24 19:43:35 +00:00 · 2024-11-14 00:09:07 -08:00 · 2024-11-14 00:09:07 -08:00 · 09c98433e7
commit 09c98433e7
parent dd99e34d66
1 changed files with 169 additions and 132 deletions
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@ -51,6 +51,10 @@ def example_prompts():
    return prompts, negative_prompt


+def warmup_prompts():
+    return "warm up", "bad"
+
+
 def measure_gpu_memory(monitor_type, func, start_memory=None):
    return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)

@ -136,7 +140,14 @@ def run_ort_pipeline(
    prompts, negative_prompt = example_prompts()

    def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )

    # Run warm up, and measure GPU memory of two runs
    # cuDNN/MIOpen The first run has  algo search so it might need more memory)
@ -149,22 +160,20 @@ def run_ort_pipeline(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                [prompt] * batch_size,
-                height,
-                width,
-                num_inference_steps=steps,
-                negative_prompt=[negative_prompt] * batch_size,
-                guidance_scale=7.5,
-            ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+        ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    from onnxruntime import __version__ as ort_version

@ -200,7 +209,14 @@ def run_torch_pipeline(

    # total 2 runs of warm up, and measure GPU memory for CUDA EP
    def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )

    # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
    first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
@ -215,25 +231,23 @@ def run_torch_pipeline(
        if i >= num_prompts:
            break
        torch.cuda.synchronize()
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                prompt=[prompt] * batch_size,
-                height=height,
-                width=width,
-                num_inference_steps=steps,
-                guidance_scale=7.5,
-                negative_prompt=[negative_prompt] * batch_size,
-                generator=None,  # torch.Generator
-            ).images
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+            generator=None,  # torch.Generator
+        ).images

-            torch.cuda.synchronize()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        torch.cuda.synchronize()
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    return {
        "engine": "torch",
@ -306,6 +320,7 @@ def get_optimum_ort_pipeline(
    directory: str,
    provider="CUDAExecutionProvider",
    disable_safety_checker: bool = True,
+    use_io_binding: bool = False,
 ):
    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline

@ -321,7 +336,7 @@ def get_optimum_ort_pipeline(
            pipeline = ORTStableDiffusionPipeline.from_pretrained(
                directory,
                provider=provider,
-                use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+                use_io_binding=use_io_binding,
            )
    elif "xl" in model_name:
        pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
@ -337,7 +352,7 @@ def get_optimum_ort_pipeline(
            model_name,
            export=True,
            provider=provider,
-            use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+            use_io_binding=use_io_binding,
        )
        pipeline.save_pretrained(directory)

@ -359,15 +374,33 @@ def run_optimum_ort_pipeline(
    batch_count,
    start_memory,
    memory_monitor_type,
+    use_num_images_per_prompt=False,
 ):
    from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline

    assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))

-    prompts = example_prompts()
+    prompts, negative_prompt = example_prompts()

    def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        if use_num_images_per_prompt:
+            pipe(
+                prompt=prompt,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=negative,
+                num_images_per_prompt=batch_size,  # same image count as the timed run (batch_size, not batch_count)
+            )
+        else:
+            pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative] * batch_size,
+            )

    # Run warm up, and measure GPU memory of two runs.
    # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
@ -380,23 +413,30 @@ def run_optimum_ort_pipeline(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
+        inference_start = time.time()
+        if use_num_images_per_prompt:
            images = pipe(
-                prompt,
-                height,
-                width,
+                prompt=prompt,
+                height=height,
+                width=width,
                num_inference_steps=steps,
-                negative_prompt=None,
-                guidance_scale=0.0,  # 7.5
+                negative_prompt=negative_prompt,
                num_images_per_prompt=batch_size,
            ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        else:
+            images = pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative_prompt] * batch_size,
+            ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    from onnxruntime import __version__ as ort_version

@ -429,9 +469,12 @@ def run_optimum_ort(
    batch_count: int,
    start_memory,
    memory_monitor_type,
+    use_io_binding: bool = False,
 ):
    load_start = time.time()
-    pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
+    pipe = get_optimum_ort_pipeline(
+        model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
+    )
    load_end = time.time()
    print(f"Model loading took {load_end - load_start} seconds")

@ -530,9 +573,8 @@ def run_ort_trt_static(
    pipeline.load_resources(height, width, batch_size)

    def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
@ -548,24 +590,23 @@ def run_ort_trt_static(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            guidance=7.5,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    pipeline.teardown()

@ -671,9 +712,8 @@ def run_tensorrt_static(
    pipeline.load_resources(height, width, batch_size)

    def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
@ -689,24 +729,22 @@ def run_tensorrt_static(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")

    pipeline.teardown()

@ -828,7 +866,8 @@ def run_tensorrt_static_xl(
        )

    def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
@ -845,20 +884,15 @@ def run_tensorrt_static_xl(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.png")

    pipeline.teardown()

@ -911,8 +945,6 @@ def run_ort_trt_xl(
        opt_batch_size=batch_size,
    )

-    from cuda import cudart
-
    assert batch_size <= max_batch_size

    pipeline.load_resources(height, width, batch_size)
@ -929,7 +961,8 @@ def run_ort_trt_xl(
        )

    def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)

    # Run warm up, and measure GPU memory of two runs
    # The first run has algo search so it might need more memory
@ -946,22 +979,17 @@ def run_ort_trt_xl(
    for i, prompt in enumerate(prompts):
        if i >= num_prompts:
            break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
-                image.save(filename)
-                print("Image saved to", filename)
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            filename = f"{image_filename_prefix}_{i}_{k}.png"
+            image.save(filename)
+            print("Image saved to", filename)

    pipeline.teardown()

@ -1137,6 +1165,14 @@ def parse_arguments():
    )
    parser.set_defaults(use_xformers=False)

+    parser.add_argument(
+        "--use_io_binding",
+        required=False,
+        action="store_true",
+        help="Use I/O Binding for Optimum.",
+    )
+    parser.set_defaults(use_io_binding=False)
+
    parser.add_argument(
        "-b",
        "--batch_size",
@ -1176,8 +1212,8 @@ def parse_arguments():
        "--num_prompts",
        required=False,
        type=int,
-        default=1,
-        help="Number of prompts. Default is 1.",
+        default=10,
+        help="Number of prompts. Default is 10.",
    )

    parser.add_argument(
@ -1312,6 +1348,7 @@ def main():
            batch_count=args.batch_count,
            start_memory=start_memory,
            memory_monitor_type=memory_monitor_type,
+            use_io_binding=args.use_io_binding,
        )
    elif args.engine == "onnxruntime":
        assert args.pipeline and os.path.isdir(