[ROCm] update Stable Diffusion benchmark to support ROCm EP (#15094)

Update Stable Diffusion benchmark to support ROCm EP
2026-07-16 18:31:27 +00:00 · 2023-03-29 15:19:52 +08:00 · 2023-03-29 15:19:52 +08:00 · a6279d4cfb
commit a6279d4cfb
parent 85948d6bc6
3 changed files with 110 additions and 13 deletions
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@ -16,6 +16,11 @@ SD_MODELS = {
    "2.1": "stabilityai/stable-diffusion-2-1",
 }

+PROVIDERS = {
+    "cuda": "CUDAExecutionProvider",
+    "rocm": "ROCMExecutionProvider",
+}
+

 def example_prompts():
    prompts = [
@ -187,7 +192,16 @@ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, dis


 def run_ort_pipeline(
-    pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
+    pipe,
+    batch_size: int,
+    image_filename_prefix: str,
+    height,
+    width,
+    steps,
+    num_prompts,
+    batch_count,
+    start_memory,
+    enable_mem_measure,
 ):
    from diffusers import OnnxStableDiffusionPipeline

@ -199,8 +213,11 @@ def run_ort_pipeline(
        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)

    # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
-    first_run_memory = measure_gpu_memory(warmup, start_memory)
-    second_run_memory = measure_gpu_memory(warmup, start_memory)
+    first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
+    second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
+
+    if not enable_mem_measure:
+        warmup()

    latency_list = []
    for i, prompt in enumerate(prompts):
@ -243,19 +260,31 @@ def run_ort_pipeline(


 def run_torch_pipeline(
-    pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
+    pipe,
+    batch_size: int,
+    image_filename_prefix: str,
+    height,
+    width,
+    steps,
+    num_prompts,
+    batch_count,
+    start_memory,
+    enable_mem_measure,
 ):
    import torch

    prompts = example_prompts()

-    # total 2 runs of warm up, and measure GPU memory
+    # Warm up: two runs with GPU memory measurement when enabled (CUDA EP), otherwise a single run
    def warmup():
        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)

    # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
-    first_run_memory = measure_gpu_memory(warmup, start_memory)
-    second_run_memory = measure_gpu_memory(warmup, start_memory)
+    first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
+    second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
+
+    if not enable_mem_measure:
+        warmup()

    torch.set_grad_enabled(False)

@ -313,6 +342,7 @@ def run_ort(
    num_prompts,
    batch_count,
    start_memory,
+    enable_mem_measure,
 ):
    load_start = time.time()
    pipe = get_ort_pipeline(model_name, directory, provider, disable_safety_checker)
@ -321,7 +351,16 @@ def run_ort(

    image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker)
    result = run_ort_pipeline(
-        pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
+        pipe,
+        batch_size,
+        image_filename_prefix,
+        height,
+        width,
+        steps,
+        num_prompts,
+        batch_count,
+        start_memory,
+        enable_mem_measure,
    )

    result.update(
@ -347,6 +386,7 @@ def run_torch(
    num_prompts,
    batch_count,
    start_memory,
+    enable_mem_measure,
 ):
    import torch

@ -365,11 +405,29 @@ def run_torch(
    if not enable_torch_compile:
        with torch.inference_mode():
            result = run_torch_pipeline(
-                pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
+                pipe,
+                batch_size,
+                image_filename_prefix,
+                height,
+                width,
+                steps,
+                num_prompts,
+                batch_count,
+                start_memory,
+                enable_mem_measure,
            )
    else:
        result = run_torch_pipeline(
-            pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
+            pipe,
+            batch_size,
+            image_filename_prefix,
+            height,
+            width,
+            steps,
+            num_prompts,
+            batch_count,
+            start_memory,
+            enable_mem_measure,
        )

    result.update(
@ -396,6 +454,16 @@ def parse_arguments():
        help="Engines to benchmark. Default is onnxruntime.",
    )

+    parser.add_argument(
+        "-r",
+        "--provider",
+        required=False,
+        type=str,
+        default="cuda",
+        choices=list(PROVIDERS.keys()),
+        help="Provider to benchmark. Default is CUDAExecutionProvider.",
+    )
+
    parser.add_argument(
        "-v",
        "--version",
@ -500,14 +568,16 @@ def main():
    args = parse_arguments()
    print(args)

-    start_memory = measure_gpu_memory(None)
+    enable_mem_measure = args.provider == "cuda"
+
+    start_memory = measure_gpu_memory(None) if enable_mem_measure else -1
    print("GPU memory used before loading models:", start_memory)

    sd_model = SD_MODELS[args.version]
+    provider = PROVIDERS[args.provider]
    if args.engine == "onnxruntime":
        assert args.pipeline, "--pipeline should be specified for onnxruntime engine"

-        provider = "CUDAExecutionProvider"
        result = run_ort(
            sd_model,
            args.pipeline,
@ -520,6 +590,7 @@ def main():
            args.num_prompts,
            args.batch_count,
            start_memory,
+            enable_mem_measure,
        )
    else:
        result = run_torch(
@ -534,6 +605,7 @@ def main():
            args.num_prompts,
            args.batch_count,
            start_memory,
+            enable_mem_measure,
        )

    print(result)
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py
@ -17,6 +17,9 @@
 #
 # If you are using nightly package (or built from source), you can force MultiHeadAttention to run in float32:
 #    python optimize_pipeline.py -i ./sd-v2-1 -o ./sd-v2-1-fp16 --float16 --force_fp32_ops unet:MultiHeadAttention
+#
+# ROCm EP doesn't support MultiHeadAttention; add --disable_attention to disable attention fusion:
+#    python optimize_pipeline.py -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16 --disable_attention

 import argparse
 import logging
@ -51,6 +54,7 @@ def optimize_sd_pipeline(
    float16: bool,
    force_fp32_ops: List[str],
    enable_runtime_optimization: bool,
+    args,
 ):
    """Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16.

@ -123,7 +127,8 @@ def optimize_sd_pipeline(
        # Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead.
        logger.info(f"Optimize {onnx_model_path}...")

-        fusion_options = FusionOptions(model_type)
+        args.model_type = model_type
+        fusion_options = FusionOptions.parse(args)

        if model_type in ["unet"]:
            # Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd
@ -286,6 +291,8 @@ def parse_arguments():
    )
    parser.set_defaults(use_external_data_format=False)

+    FusionOptions.add_arguments(parser)
+
    args = parser.parse_args()
    return args

@ -303,6 +310,7 @@ def main():
        args.float16,
        args.force_fp32_ops,
        args.inspect,
+        args,
    )


--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
@ -0,0 +1,17 @@
+transformers==4.26.0
+numpy==1.24.1
+accelerate==0.15.0
+onnx==1.13.0
+coloredlogs
+packaging==23.0
+protobuf==3.20.3
+psutil==5.9.4
+sympy==1.11.1
+
+# Install diffusers from source
+# git clone https://github.com/huggingface/diffusers.git
+# cd diffusers && git checkout c4892f1855097a68703ca2e949aca15829526958
+# pip install -e .
+
+# Install onnxruntime-rocm or onnxruntime_training
+# Build onnxruntime-rocm from source or install latest onnxruntime_training rocm nightly python package