[ROCm] update Stable Diffusion benchmark to support ROCm EP (#15094)

Update Stable Diffusion benchmark to support ROCm EP
This commit is contained in:
PeixuanZuo 2023-03-29 15:19:52 +08:00 committed by GitHub
parent 85948d6bc6
commit a6279d4cfb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 110 additions and 13 deletions

View file

@ -16,6 +16,11 @@ SD_MODELS = {
"2.1": "stabilityai/stable-diffusion-2-1",
}
# Short provider names selectable via --provider, mapped to the execution
# provider name that is handed to the pipeline.
PROVIDERS = dict(
    cuda="CUDAExecutionProvider",
    rocm="ROCMExecutionProvider",
)
def example_prompts():
prompts = [
@ -187,7 +192,16 @@ def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, dis
def run_ort_pipeline(
pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
pipe,
batch_size: int,
image_filename_prefix: str,
height,
width,
steps,
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
):
from diffusers import OnnxStableDiffusionPipeline
@ -199,8 +213,11 @@ def run_ort_pipeline(
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
first_run_memory = measure_gpu_memory(warmup, start_memory)
second_run_memory = measure_gpu_memory(warmup, start_memory)
first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
if not enable_mem_measure:
warmup()
latency_list = []
for i, prompt in enumerate(prompts):
@ -243,19 +260,31 @@ def run_ort_pipeline(
def run_torch_pipeline(
pipe, batch_size: int, image_filename_prefix: str, height, width, steps, num_prompts, batch_count, start_memory
pipe,
batch_size: int,
image_filename_prefix: str,
height,
width,
steps,
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
):
import torch
prompts = example_prompts()
# total 2 runs of warm up, and measure GPU memory
# total 2 runs of warm up, and measure GPU memory for CUDA EP
def warmup():
pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
# Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
first_run_memory = measure_gpu_memory(warmup, start_memory)
second_run_memory = measure_gpu_memory(warmup, start_memory)
first_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
second_run_memory = measure_gpu_memory(warmup, start_memory) if enable_mem_measure else -1
if not enable_mem_measure:
warmup()
torch.set_grad_enabled(False)
@ -313,6 +342,7 @@ def run_ort(
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
):
load_start = time.time()
pipe = get_ort_pipeline(model_name, directory, provider, disable_safety_checker)
@ -321,7 +351,16 @@ def run_ort(
image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker)
result = run_ort_pipeline(
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
pipe,
batch_size,
image_filename_prefix,
height,
width,
steps,
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
)
result.update(
@ -347,6 +386,7 @@ def run_torch(
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
):
import torch
@ -365,11 +405,29 @@ def run_torch(
if not enable_torch_compile:
with torch.inference_mode():
result = run_torch_pipeline(
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
pipe,
batch_size,
image_filename_prefix,
height,
width,
steps,
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
)
else:
result = run_torch_pipeline(
pipe, batch_size, image_filename_prefix, height, width, steps, num_prompts, batch_count, start_memory
pipe,
batch_size,
image_filename_prefix,
height,
width,
steps,
num_prompts,
batch_count,
start_memory,
enable_mem_measure,
)
result.update(
@ -396,6 +454,16 @@ def parse_arguments():
help="Engines to benchmark. Default is onnxruntime.",
)
parser.add_argument(
"-r",
"--provider",
required=False,
type=str,
default="cuda",
choices=list(PROVIDERS.keys()),
help="Provider to benchmark. Default is CUDAExecutionProvider.",
)
parser.add_argument(
"-v",
"--version",
@ -500,14 +568,16 @@ def main():
args = parse_arguments()
print(args)
start_memory = measure_gpu_memory(None)
enable_mem_measure = args.provider == "cuda"
start_memory = measure_gpu_memory(None) if enable_mem_measure else -1
print("GPU memory used before loading models:", start_memory)
sd_model = SD_MODELS[args.version]
provider = PROVIDERS[args.provider]
if args.engine == "onnxruntime":
assert args.pipeline, "--pipeline should be specified for onnxruntime engine"
provider = "CUDAExecutionProvider"
result = run_ort(
sd_model,
args.pipeline,
@ -520,6 +590,7 @@ def main():
args.num_prompts,
args.batch_count,
start_memory,
enable_mem_measure,
)
else:
result = run_torch(
@ -534,6 +605,7 @@ def main():
args.num_prompts,
args.batch_count,
start_memory,
enable_mem_measure,
)
print(result)

View file

@ -17,6 +17,9 @@
#
# If you are using nightly package (or built from source), you can force MultiHeadAttention to run in float32:
# python optimize_pipeline.py -i ./sd-v2-1 -o ./sd-v2-1-fp16 --float16 --force_fp32_ops unet:MultiHeadAttention
#
# ROCm EP doesn't support MultiHeadAttention; add --disable_attention to disable attention fusion:
# python optimize_pipeline.py -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16 --disable_attention
import argparse
import logging
@ -51,6 +54,7 @@ def optimize_sd_pipeline(
float16: bool,
force_fp32_ops: List[str],
enable_runtime_optimization: bool,
args,
):
"""Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16.
@ -123,7 +127,8 @@ def optimize_sd_pipeline(
# Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead.
logger.info(f"Optimize {onnx_model_path}...")
fusion_options = FusionOptions(model_type)
args.model_type = model_type
fusion_options = FusionOptions.parse(args)
if model_type in ["unet"]:
# Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd
@ -286,6 +291,8 @@ def parse_arguments():
)
parser.set_defaults(use_external_data_format=False)
FusionOptions.add_arguments(parser)
args = parser.parse_args()
return args
@ -303,6 +310,7 @@ def main():
args.float16,
args.force_fp32_ops,
args.inspect,
args,
)

View file

@ -0,0 +1,17 @@
transformers==4.26.0
numpy==1.24.1
accelerate==0.15.0
onnx==1.13.0
coloredlogs
packaging==23.0
protobuf==3.20.3
psutil==5.9.4
sympy==1.11.1
# Install diffusers from source
# git clone https://github.com/huggingface/diffusers.git
# cd diffusers && git checkout c4892f1855097a68703ca2e949aca15829526958
# pip install -e .
# Install onnxruntime-rocm or onnxruntime_training
# Build onnxruntime-rocm from source, or install the latest onnxruntime_training ROCm nightly Python package