From e0c1fa35a87657e3e7945fbc7422d69226531fe8 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 9 May 2023 15:29:13 -0700
Subject: [PATCH] update stable diffusion script and doc (#15846)

### Description
Update script:
(1) change some float16 verbose logging to debug level.
(2) Let requirements-cuda.txt includes requirements.txt
(3) Use an environment variable ORT_DISABLE_TRT_FLASH_ATTENTION=1 to
avoid black image in 2.1 model. Update benchmark and doc.
(4) Update document to include command lines to build ORT rocm from
source.
(5) Update optimize_pipeline.py so that user can disable packed qkv/kv
from command line options.
(6) Update document to use torch < 2.0 for onnx export.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../python/tools/transformers/float16.py      |   8 +-
 .../models/stable_diffusion/README.md         | 125 +++++++++++-------
 .../models/stable_diffusion/benchmark.py      |   5 +
 .../stable_diffusion/optimize_pipeline.py     |   6 +-
 .../stable_diffusion/requirements-cuda.txt    |   5 +-
 .../stable_diffusion/requirements-rocm.txt    |   1 +
 6 files changed, 93 insertions(+), 57 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py
index 790d9d45c0..c2a7a055e9 100644
--- a/onnxruntime/python/tools/transformers/float16.py
+++ b/onnxruntime/python/tools/transformers/float16.py
@@ -53,17 +53,17 @@ def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65
         positive_max = np_array[np.where(np_array > 0)].max()
         positive_min = np_array[np.where(np_array > 0)].min()
         if positive_max >= max_finite_val:
-            logger.info(f"the float32 number {positive_max} will be truncated to {max_finite_val}")
+            logger.debug(f"the float32 number {positive_max} will be truncated to {max_finite_val}")
         if positive_min <= min_positive_val:
-            logger.info(f"the float32 number {positive_min} will be truncated to {min_positive_val}")
+            logger.debug(f"the float32 number {positive_min} will be truncated to {min_positive_val}")
 
     if np_array[np.where(np_array < 0)].shape[0] > 0:
         negative_max = np_array[np.where(np_array < 0)].max()
         negative_min = np_array[np.where(np_array < 0)].min()
         if negative_min <= -max_finite_val:
-            logger.info(f"the float32 number {negative_min} will be truncated to {-max_finite_val}")
+            logger.debug(f"the float32 number {negative_min} will be truncated to {-max_finite_val}")
         if negative_max >= -min_positive_val:
-            logger.info(f"the float32 number {negative_max} will be truncated to {-min_positive_val}")
+            logger.debug(f"the float32 number {negative_max} will be truncated to {-min_positive_val}")
 
     np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
     np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
index 980a867175..e47bcc62d9 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
@@ -70,27 +70,23 @@ cd onnxruntime/python/tools/transformers/models/stable_diffusion
 
 ## Example of Stable Diffusion 1.5
 
-Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd-v1-5` instead of `./sd-v1-5`.
+Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd` instead of `./sd`.
 
 ### Setup Environment (CUDA)
 
+It is recommended to create a Conda environment with Python 3.8, 3.9 or 3.10, and run the model with [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) or 11.8.
 ```
-conda create -n py310 python=3.10
-conda activate py310
-pip install -r requirements.txt
+conda create -n py38 python=3.8
+conda activate py38
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
 pip install -r requirements-cuda.txt
 ```
 
-For Windows, the torch package installed from PyPI is CPU only. To enable support for GPU, it is necessary to install PyTorch version 1.13.1+cu117 or above using the following method:
-```
-pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
-```
-
-ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions (like [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and cuDNN 8.5.0.96 in Windows).
+ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions.
 
 #### Install Nightly (Optional)
 
-Skip this step if you use onnxruntime-gpu 1.14.* release package.
+Skip this step if you use onnxruntime-gpu package from official releases.
 
 To try latest optimizations, you can install [ort-nightly-gpu](https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/ORT-Nightly/PyPI/ort-nightly-gpu/) package like the following:
 
@@ -101,64 +97,97 @@ pip install ort-nightly-gpu -i https://aiinfra.pkgs.visualstudio.com/PublicPacka
 
 ### Setup Environment (ROCm)
 
-It is recommended that the users should run the model with ROCm 5.4 or newer and Python 3.9. Note that Windows is not
-supported for ROCm at the moment.
+It is recommended that the users run the model with ROCm 5.4 or newer and Python 3.8, 3.9 or 3.10.
+Note that Windows is not supported for ROCm at the moment.
 
 ```
-conda create -n py39 python=3.9
-conda activate py39
-pip install -r requirements.txt
+conda create -n py38 python=3.8
+conda activate py38
+wget https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/torch-1.12.1%2Brocm5.4-cp38-cp38-linux_x86_64.whl
+pip install torch-1.12.1+rocm5.4-cp38-cp38-linux_x86_64.whl
 pip install -r requirements-rocm.txt
 ```
 
-AMD GPU version of torch build can be installed from [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/),
-user need to download the whl file and `pip install <file.whl>` manually. Or directly from https://download.pytorch.org/whl/rocm5.4.2 via
+AMD GPU version of PyTorch can be installed from [pytorch.org](https://pytorch.org/get-started/locally/) or [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/).
+
+#### Install onnxruntime-rocm
+
+Here is an example to build onnxruntime from source with Rocm 5.4.2 in Ubuntu 20.04, and install the wheel.
+
+(1) Install [ROCm 5.4.2](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.2/page/How_to_Install_ROCm.html). Note that the version is also used in PyTorch 2.0 ROCm package.
+
+(2) Install some tools used in build:
 ```
-pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/rocm5.4.2
+sudo apt-get update
+sudo apt-get install -y --no-install-recommends \
+        wget \
+        zip \
+        ca-certificates \
+        build-essential \
+        curl \
+        libcurl4-openssl-dev \
+        libssl-dev \
+        python3-dev
+pip install numpy packaging "wheel>=0.35.1"
+wget --quiet https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz
+tar zxf cmake-3.26.3-linux-x86_64.tar.gz
+export PATH=${PWD}/cmake-3.26.3-linux-x86_64/bin:${PATH}
 ```
 
-Please follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build ONNXRuntime from source.
+(3) Build and Install ONNX Runtime
+```
+git clone https://github.com/microsoft/onnxruntime
+cd onnxruntime
+sh build.sh --config Release --use_rocm --rocm_home /opt/rocm --rocm_version 5.4.2 --build_wheel
+pip install build/Linux/Release/dist/*.whl
+```
+
+You can also follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build with docker.
 
 ### Export ONNX pipeline
+This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers.
 
-This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers. Before running the script, you need to be logged in via `huggingface-cli login`.
+It is recommended to use PyTorch 1.12.1 or 1.13.1 in this step. Using PyTorch 2.0 will encounter issue in exporting onnx.
 
 ```
 curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py
-python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5  --output_path  ./sd-v1-5
+python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5  --output_path  ./sd_v1_5/fp32
 ```
 
 ### Optimize ONNX Pipeline
 
 Example to optimize the exported float32 ONNX models, and save to float16 models:
-
 ```
-python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16
+python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd_v1_5/fp32 -o ./sd_v1_5/fp16 --float16
 ```
 
 If you installed ONNX Runtime v1.14, some optimizations (packed QKV and BiasAdd) will be disabled automatically since they are not available in v1.14.
 
-For Stable Diffusion 2.1 model with CUDA EP, you will need force Attention to run in float32 to avoid black image by appending `--force_fp32_ops unet:Attention` to the command line.
-If you are using nightly package, append `--force_fp32_ops unet:MultiHeadAttention` instead.
-
 ### Run Benchmark
 
 The benchmark.py script will run a warm-up prompt twice, and measure the peak GPU memory usage in these two runs, then record them as first_run_memory_MB and second_run_memory_MB. Then it will run 5 runs to get average latency (in seconds), and output the results to benchmark_result.csv.
 
 Note that the first run might need more time and memory: For example, cuDNN convolution algorithm search or model compile happens in the first run.
 
-Example to benchmark the optimized pipeline with batch size 1 on CUDA EP:
+To avoid black image output for Stable Diffusion 2.1 with CUDA EP, we can set an environment variable before inferencing:
 ```
-python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1
+export ORT_DISABLE_TRT_FLASH_ATTENTION=1
+```
+
+Before running benchmark on PyTorch, you need to be logged in via `huggingface-cli login` once.
+
+Example to benchmark the optimized pipeline of stable diffusion 1.5 with batch size 1 on CUDA EP:
+```
+python benchmark.py -p ./sd_v1_5/fp16 -b 1 -v 1.5
 ```
 
 On ROCm EP, use the following command instead:
 ```
-python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1 --tuning --provider=rocm
+python benchmark.py -p ./sd_v1_5/fp16 -b 1 --tuning --provider rocm -v 1.5
 ```
 
-Note: you can substitute `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` with `python benchmark.py` if your current working directory is this files directory.
-In the following, we will use it interchangeably.
+For ROCm EP, you can substitute `python benchmark.py` with `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` since
+the installed package is built from source. For CUDA, it is recommended to run `python benchmark.py` with the latest benchmark script.
 
 For ROCm EP, the `--tuning` is mandatory because we heavily rely on tuning to find the runable kernels for ORT `OpKernel`s.
 
@@ -169,20 +198,22 @@ The default parameters are stable diffusion version=1.5, height=512, width=512,
 Run PyTorch 1.13.1+cu117 with xFormers like the following
 
 ```
-python benchmark.py -e torch -b 1 --use_xformers
+pip install xformers==0.0.16
+python benchmark.py -e torch -b 1 --use_xformers -v 1.5
 ```
 
 ### Run Benchmark with PyTorch 2.0 with torch.compile
 
-Let's create a new environment to run PyTorch 2.0:
-
+For CUDA:
 ```
-conda create -n pt2 python=3.10
-conda activate pt2
-pip install torch --index-url https://download.pytorch.org/whl/cu117
-pip install -r requirements.txt
-pip install -r requirements_cuda.txt  # or requirements_rocm.txt
-python benchmark.py -e torch -b 1 --enable_torch_compile
+pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu117
+python benchmark.py -e torch -b 1 --enable_torch_compile -v 1.5
+```
+
+For ROCm:
+```
+pip install torch --upgrade --index-url https://download.pytorch.org/whl/rocm5.4.2
+python benchmark.py -e torch -b 1 --enable_torch_compile --provider rocm  -v 1.5
 ```
 
 Sometime, it complains ptxas not found when there are multiple CUDA versions installed. It can be fixed like `export TRITON_PTXAS_PATH=/usr/local/cuda-11.7/bin/ptxas` before running benchmark.
@@ -273,15 +304,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine:
 
 | engine      | version                 | provider              | batch size | average latency | first run memory MB | second run memory MB |
 | ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
-| onnxruntime | dev                     | ROCMExecutionProvider | 1          | 2.2             | 5,548               | 4,908                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 1          | 2.2             | 5,548               | 4,908                |
 | torch       | 1.12.1+rocm5.4          | -                     | 1          | 3.4             | 6,653               | 4,613                |
 | torch       | 2.0.0+rocm5.4.2         | default               | 1          | 3.2             | 5,977               | 4,368                |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 1          | 3.0             | 5,869               | 4,266                |
-| onnxruntime | dev                     | ROCMExecutionProvider | 4          | 6.6             | 5,546               | 4,906                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 4          | 6.6             | 5,546               | 4,906                |
 | torch       | 1.12.1+rocm5.4          | -                     | 4          | 10.1            | 19,477              | 11,325               |
 | torch       | 2.0.0+rocm5.4.2         | default               | 4          | 10.5            | 13,051              | 7,300                |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 4          | 9.2             | 12,879              | 7,190                |
-| onnxruntime | dev                     | ROCMExecutionProvider | 8          | 12.5            | 9,778               | 9,006                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 8          | 12.5            | 9,778               | 9,006                |
 | torch       | 1.12.1+rocm5.4          | -                     | 8          | 19.3            | 55,851              | 20,014               |
 | torch       | 2.0.0+rocm5.4.2         | default               | 8          | 20.3            | 23,551              | 11,930               |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 8          | 17.8            | 23,303              | 11,800               |
@@ -290,15 +321,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine:
 
 | engine      | version                 | provider              | batch size | average latency | first run memory MB | second run memory MB |
 | ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
-| onnxruntime | dev                     | ROCMExecutionProvider | 1          | 2.4             | 5,254               | 4,614                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 1          | 2.4             | 5,254               | 4,614                |
 | torch       | 1.12.1+rocm5.4          | -                     | 1          | 3.5             | 5,771               | 4,672                |
 | torch       | 2.0.0+rocm5.4.2         | default               | 1          | 3.5             | 5,811               | 4,206                |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 1          | 3.1             | 5,774               | 4,168                |
-| onnxruntime | dev                     | ROCMExecutionProvider | 4          | 7.5             | 7,290               | 6,646                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 4          | 7.5             | 7,290               | 6,646                |
 | torch       | 1.12.1+rocm5.4          | -                     | 4          | 10.7            | 19,334              | 11,181               |
 | torch       | 2.0.0+rocm5.4.2         | default               | 4          | 11.5            | 12,881              | 7,151                |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 4          | 10.0            | 12,740              | 7,073                |
-| onnxruntime | dev                     | ROCMExecutionProvider | 8          | 14.4            | 7,320               | 6,676                |
+| onnxruntime | 1.15.0+rocm5.4.2        | ROCMExecutionProvider | 8          | 14.4            | 7,320               | 6,676                |
 | torch       | 1.12.1+rocm5.4          | -                     | 8          | 20.2            | 31,820              | 19,908               |
 | torch       | 2.0.0+rocm5.4.2         | default               | 8          | 22.2            | 23,415              | 11,815               |
 | torch       | 2.0.0+rocm5.4.2         | compile               | 8          | 19.3            | 23,154              | 11,667               |
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
index 1226c3bfab..4e00ded9e3 100755
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@@ -641,6 +641,11 @@ def main():
     if args.engine == "onnxruntime":
         assert args.pipeline, "--pipeline should be specified for onnxruntime engine"
 
+        if args.version in ["2.1"]:
+            # Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model
+            # This shall be done before the first inference run.
+            os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"
+
         result = run_ort(
             sd_model,
             args.pipeline,
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py
index c74418f58a..5e8a25eb0b 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py
@@ -131,9 +131,9 @@ def optimize_sd_pipeline(
         if model_type in ["unet"]:
             # Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd
             has_all_optimizations = version.parse(onnxruntime.__version__) >= version.parse("1.15.0")
-            fusion_options.enable_packed_kv = float16
-            fusion_options.enable_packed_qkv = float16 and has_all_optimizations
-            fusion_options.enable_bias_add = has_all_optimizations
+            fusion_options.enable_packed_kv = float16 and fusion_options.enable_packed_kv
+            fusion_options.enable_packed_qkv = float16 and has_all_optimizations and fusion_options.enable_packed_qkv
+            fusion_options.enable_bias_add = has_all_optimizations and fusion_options.enable_bias_add
 
         m = optimize_model(
             str(onnx_model_path),
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt
index d96199fc2f..18852f515a 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt
@@ -1,7 +1,6 @@
-# Install the following package in python 3.10
+-r requirements.txt
 onnxruntime-gpu>=1.14
 py3nvml==0.2.7
-xformers==0.0.16
-#For Windows, need install PyTorch 1.13.1+cu117 since torch in pypi is CPU version
+#To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117
 #--extra-index-url https://download.pytorch.org/whl/cu117
 #torch==1.13.1+cu117
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
index ada3db3020..c0a925e25b 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
@@ -1,3 +1,4 @@
+-r requirements.txt
 # Install onnxruntime-rocm or onnxruntime_training
 # Build onnxruntime-rocm from source
 # Directly install pre-built onnxruntime/onnxruntime-training rocm python package is not possible at the moment.