From 77b45c6503e9fd5502d288337370faef30f6e34f Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 14 Jul 2023 10:37:00 -0700 Subject: [PATCH] Add Stable Diffusion Benchmark on A100-PCIE-80GB (#16702) (1) Fix a bug introduced in https://github.com/microsoft/onnxruntime/pull/16560: the fp16 flag must be set for UNet. (2) Remove wget from requirements since it is no longer needed. (3) Add benchmark numbers for A100-PCIE-80GB. Note that the CUDA EP has an issue running with batch size 4, so those numbers are not added. --- .../transformers/models/stable_diffusion/README.md | 12 ++++++++++++ .../stable_diffusion/onnxruntime_cuda_txt2img.py | 3 +-- .../stable_diffusion/onnxruntime_tensorrt_txt2img.py | 1 + .../stable_diffusion/requirements-tensorrt.txt | 1 - 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index 19d205fb35..d184224317 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -277,6 +277,18 @@ Common settings for below test results: | torch | 2.0.0+cu117 | default | 16 | 14.8 | 32,306 | 16,520 | | torch | 2.0.0+cu117 | compile | 16 | 12.6 | 32,636 | 16,898 | +#### Results of A100-PCIE-80GB (Ubuntu 20.04) +| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | +| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | +| tensorrt | 8.6.1 | default | 1 | 1.00 | 9,056 | 9,056 | +| onnxruntime | 1.16.0 nightly | tensorrt | 1 | 1.09 | 11,250 | 11,250 | +| onnxruntime | 1.16.0 nightly | tensorrt (cuda graph) | 1 | 0.96 | 11,382 | 11,382 | +| onnxruntime | 1.16.0 nightly | cuda | 1 | 1.11 | 4,760 | 5,144 | +| onnxruntime | 1.16.0 nightly | cuda (cuda graph) | 1 | 1.04 | 5,230 | 5,390 | +| 
tensorrt | 8.6.1 | default | 4 | 3.39 | 9,072 | 9,072 | +| onnxruntime | 1.16.0 nightly | tensorrt | 4 | 3.60 | 11,266 | 11,266 | +| onnxruntime | 1.16.0 nightly | tensorrt (cuda graph) | 4 | 3.43 | 11,428 | 11,428 | + #### Results of V100-PCIE-16GB (Ubuntu 20.04) Results from Standard_NC6s_v3 Azure virtual machine: diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py index cecfc976b3..bd29e3e42c 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py @@ -27,8 +27,7 @@ Modifications: (1) Create ONNX Runtime session (2) Use I/O Binding of ONNX Runti Installation instructions pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 pip install --upgrade transformers diffusers>=0.16.0 -pip install --upgrade tensorrt>=8.6.1 -pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com +pip install numpy>=1.24.1 onnx>=1.13.0 coloredlogs protobuf==3.20.3 psutil sympy pip install onnxruntime-gpu """ diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py index 7c29fd4af1..80f257db29 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py @@ -644,6 +644,7 @@ class OnnxruntimeTensorRTStableDiffusionPipeline(StableDiffusionPipeline): self.models["unet"] = UNet( self.unet, + fp16=True, device=self.torch_device, max_batch_size=self.max_batch_size, embedding_dim=self.embedding_dim, diff --git 
a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt index e95fa6691f..567f39c011 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt @@ -11,7 +11,6 @@ sympy tensorrt>=8.6.1 onnxruntime-gpu>=1.15.1 py3nvml -wget # cuda-python version shall be compatible with CUDA version of torch and onnxruntime-gpu cuda-python==11.7.0 #To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117