From 77b45c6503e9fd5502d288337370faef30f6e34f Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Fri, 14 Jul 2023 10:37:00 -0700
Subject: [PATCH] Add Stable Diffusion Benchmark on A100-PCIE-80GB (#16702)

(1) Fix a bug in https://github.com/microsoft/onnxruntime/pull/16560:
the fp16 flag shall be set for UNet.
(2) Remove wget in requirements since it is no longer needed.
(3) Add benchmark numbers for A100-PCIE-80GB. Note that the CUDA EP has
an issue running with batch size 4, so that number is not added.
---
 .../transformers/models/stable_diffusion/README.md   | 12 ++++++++++++
 .../stable_diffusion/onnxruntime_cuda_txt2img.py     |  3 +--
 .../stable_diffusion/onnxruntime_tensorrt_txt2img.py |  1 +
 .../stable_diffusion/requirements-tensorrt.txt       |  1 -
 4 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
index 19d205fb35..d184224317 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
@@ -277,6 +277,18 @@ Common settings for below test results:
 | torch       | 2.0.0+cu117             | default               | 16         | 14.8            | 32,306              | 16,520               |
 | torch       | 2.0.0+cu117             | compile               | 16         | 12.6            | 32,636              | 16,898               |
 
+#### Results of A100-PCIE-80GB (Ubuntu 20.04)
+| engine      | version                 | provider              | batch size | average latency | first run memory MB | second run memory MB |
+| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
+| tensorrt    | 8.6.1                   | default               | 1          | 1.00            | 9,056               | 9,056                |
+| onnxruntime | 1.16.0 nightly          | tensorrt              | 1          | 1.09            | 11,250              | 11,250               |
+| onnxruntime | 1.16.0 nightly          | tensorrt (cuda graph) | 1          | 0.96            | 11,382              | 11,382               |
+| onnxruntime | 1.16.0 nightly          | cuda                  | 1          | 1.11            | 4,760               | 5,144                |
+| onnxruntime | 1.16.0 nightly          | cuda (cuda graph)     | 1          | 1.04            | 5,230               | 5,390                |
+| tensorrt    | 8.6.1                   | default               | 4          | 3.39            | 9,072               | 9,072                |
+| onnxruntime | 1.16.0 nightly          | tensorrt              | 4          | 3.60            | 11,266              | 11,266               |
+| onnxruntime | 1.16.0 nightly          | tensorrt (cuda graph) | 4          | 3.43            | 11,428              | 11,428               |
+
 #### Results of V100-PCIE-16GB (Ubuntu 20.04)
 
 Results from Standard_NC6s_v3 Azure virtual machine:
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py
index cecfc976b3..bd29e3e42c 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py
@@ -27,8 +27,7 @@ Modifications: (1) Create ONNX Runtime session (2) Use I/O Binding of ONNX Runti
 Installation instructions
 pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
 pip install --upgrade transformers diffusers>=0.16.0
-pip install --upgrade tensorrt>=8.6.1
-pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+pip install numpy>=1.24.1 onnx>=1.13.0 coloredlogs protobuf==3.20.3 psutil sympy
 pip install onnxruntime-gpu
 """
 
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py
index 7c29fd4af1..80f257db29 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py
@@ -644,6 +644,7 @@ class OnnxruntimeTensorRTStableDiffusionPipeline(StableDiffusionPipeline):
 
         self.models["unet"] = UNet(
             self.unet,
+            fp16=True,
             device=self.torch_device,
             max_batch_size=self.max_batch_size,
             embedding_dim=self.embedding_dim,
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt
index e95fa6691f..567f39c011 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt
@@ -11,7 +11,6 @@ sympy
 tensorrt>=8.6.1
 onnxruntime-gpu>=1.15.1
 py3nvml
-wget
 # cuda-python version shall be compatible with CUDA version of torch and onnxruntime-gpu
 cuda-python==11.7.0
 #To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117