mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-31 23:27:43 +00:00
[ROCm] update Stable Diffusion benchmark to support ROCm EP (#15094)
Update Stable Diffusion benchmark to support ROCm EP
This commit is contained in:
parent
85948d6bc6
commit
a6279d4cfb
3 changed files with 110 additions and 13 deletions
|
|
@ -16,6 +16,11 @@ SD_MODELS = {
|
|||
"2.1": "stabilityai/stable-diffusion-2-1",
|
||||
}
|
||||
|
||||
PROVIDERS = {
|
||||
"cuda": "CUDAExecutionProvider",
|
||||
"rocm": "ROCMExecutionProvider",
|
||||
}
|
||||
|
||||
|
||||
def example_prompts():
|
||||
prompts = [
|
||||
|
|
@ -187,7 +192,16 @@ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, dis
|
|||
|
||||
|
||||
def run_ort_pipeline(
|
||||
pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
|
||||
pipe,
|
||||
batch_size: int,
|
||||
image_filename_prefix: str,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
):
|
||||
from diffusers import OnnxStableDiffusionPipeline
|
||||
|
||||
|
|
@ -199,8 +213,11 @@ def run_ort_pipeline(
|
|||
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
|
||||
first_run_memory = measure_gpu_memory(warmup, start_memory)
|
||||
second_run_memory = measure_gpu_memory(warmup, start_memory)
|
||||
first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
|
||||
second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
|
||||
|
||||
if not enable_mem_measure:
|
||||
warmup()
|
||||
|
||||
latency_list = []
|
||||
for i, prompt in enumerate(prompts):
|
||||
|
|
@ -243,19 +260,31 @@ def run_ort_pipeline(
|
|||
|
||||
|
||||
def run_torch_pipeline(
|
||||
pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
|
||||
pipe,
|
||||
batch_size: int,
|
||||
image_filename_prefix: str,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
):
|
||||
import torch
|
||||
|
||||
prompts = example_prompts()
|
||||
|
||||
# total 2 runs of warm up, and measure GPU memory
|
||||
# total 2 runs of warm up, and measure GPU memory for CUDA EP
|
||||
def warmup():
|
||||
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
|
||||
|
||||
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
|
||||
first_run_memory = measure_gpu_memory(warmup, start_memory)
|
||||
second_run_memory = measure_gpu_memory(warmup, start_memory)
|
||||
first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
|
||||
second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
|
||||
|
||||
if not enable_mem_measure:
|
||||
warmup()
|
||||
|
||||
torch.set_grad_enabled(False)
|
||||
|
||||
|
|
@ -313,6 +342,7 @@ def run_ort(
|
|||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
):
|
||||
load_start = time.time()
|
||||
pipe = get_ort_pipeline(model_name, directory, provider, disable_safety_checker)
|
||||
|
|
@ -321,7 +351,16 @@ def run_ort(
|
|||
|
||||
image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker)
|
||||
result = run_ort_pipeline(
|
||||
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
|
||||
pipe,
|
||||
batch_size,
|
||||
image_filename_prefix,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
)
|
||||
|
||||
result.update(
|
||||
|
|
@ -347,6 +386,7 @@ def run_torch(
|
|||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
):
|
||||
import torch
|
||||
|
||||
|
|
@ -365,11 +405,29 @@ def run_torch(
|
|||
if not enable_torch_compile:
|
||||
with torch.inference_mode():
|
||||
result = run_torch_pipeline(
|
||||
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
|
||||
pipe,
|
||||
batch_size,
|
||||
image_filename_prefix,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
)
|
||||
else:
|
||||
result = run_torch_pipeline(
|
||||
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
|
||||
pipe,
|
||||
batch_size,
|
||||
image_filename_prefix,
|
||||
height,
|
||||
width,
|
||||
steps,
|
||||
num_prompts,
|
||||
batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
)
|
||||
|
||||
result.update(
|
||||
|
|
@ -396,6 +454,16 @@ def parse_arguments():
|
|||
help="Engines to benchmark. Default is onnxruntime.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default="cuda",
|
||||
choices=list(PROVIDERS.keys()),
|
||||
help="Provider to benchmark. Default is CUDAExecutionProvider.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--version",
|
||||
|
|
@ -500,14 +568,16 @@ def main():
|
|||
args = parse_arguments()
|
||||
print(args)
|
||||
|
||||
start_memory = measure_gpu_memory(None)
|
||||
enable_mem_measure = args.provider == "cuda"
|
||||
|
||||
start_memory = measure_gpu_memory(None) if enable_mem_measure else -1
|
||||
print("GPU memory used before loading models:", start_memory)
|
||||
|
||||
sd_model = SD_MODELS[args.version]
|
||||
provider = PROVIDERS[args.provider]
|
||||
if args.engine == "onnxruntime":
|
||||
assert args.pipeline, "--pipeline should be specified for onnxruntime engine"
|
||||
|
||||
provider = "CUDAExecutionProvider"
|
||||
result = run_ort(
|
||||
sd_model,
|
||||
args.pipeline,
|
||||
|
|
@ -520,6 +590,7 @@ def main():
|
|||
args.num_prompts,
|
||||
args.batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
)
|
||||
else:
|
||||
result = run_torch(
|
||||
|
|
@ -534,6 +605,7 @@ def main():
|
|||
args.num_prompts,
|
||||
args.batch_count,
|
||||
start_memory,
|
||||
enable_mem_measure,
|
||||
)
|
||||
|
||||
print(result)
|
||||
|
|
|
|||
|
|
@ -17,6 +17,9 @@
|
|||
#
|
||||
# If you are using nightly package (or built from source), you can force MultiHeadAttention to run in float32:
|
||||
# python optimize_pipeline.py -i ./sd-v2-1 -o ./sd-v2-1-fp16 --float16 --force_fp32_ops unet:MultiHeadAttention
|
||||
#
|
||||
# ROCm EP doesn't support MultiHeadAttention, add --disable_attention to disable attention fusion:
|
||||
# python optimize_pipeline.py -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16 --disable_attention
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
|
@ -51,6 +54,7 @@ def optimize_sd_pipeline(
|
|||
float16: bool,
|
||||
force_fp32_ops: List[str],
|
||||
enable_runtime_optimization: bool,
|
||||
args,
|
||||
):
|
||||
"""Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16.
|
||||
|
||||
|
|
@ -123,7 +127,8 @@ def optimize_sd_pipeline(
|
|||
# Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead.
|
||||
logger.info(f"Optimize {onnx_model_path}...")
|
||||
|
||||
fusion_options = FusionOptions(model_type)
|
||||
args.model_type = model_type
|
||||
fusion_options = FusionOptions.parse(args)
|
||||
|
||||
if model_type in ["unet"]:
|
||||
# Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd
|
||||
|
|
@ -286,6 +291,8 @@ def parse_arguments():
|
|||
)
|
||||
parser.set_defaults(use_external_data_format=False)
|
||||
|
||||
FusionOptions.add_arguments(parser)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
|
@ -303,6 +310,7 @@ def main():
|
|||
args.float16,
|
||||
args.force_fp32_ops,
|
||||
args.inspect,
|
||||
args,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
transformers==4.26.0
|
||||
numpy==1.24.1
|
||||
accelerate==0.15.0
|
||||
onnx==1.13.0
|
||||
coloredlogs
|
||||
packaging==23.0
|
||||
protobuf==3.20.3
|
||||
psutil==5.9.4
|
||||
sympy==1.11.1
|
||||
|
||||
# Install diffusers from source
|
||||
# git clone https://github.com/huggingface/diffusers.git
|
||||
# cd diffusers && git checkout c4892f1855097a68703ca2e949aca15829526958
|
||||
# pip install -e .
|
||||
|
||||
# Install onnxruntime-rocm or onnxruntime_training
|
||||
# Build onnxruntime-rocm from source or install lastest onnxruntime_training rocm nightly python package
|
||||
Loading…
Reference in a new issue