mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
update stable diffusion script and doc (#15846)
### Description Update script: (1) change some float16 verbose logging to debug level. (2) Let requirements-cuda.txt includes requirements.txt (3) Use an environment variable ORT_DISABLE_TRT_FLASH_ATTENTION=1 to avoid black image in 2.1 model. Update benchmark and doc. (4) Update document to include command lines to build ORT rocm from source. (5) Update optimize_pipeline.py so that user can disable packed qkv/kv from command line options. (6) Update document to use torch < 2.0 for onnx export. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
This commit is contained in:
parent
b473e3f3c6
commit
e0c1fa35a8
6 changed files with 93 additions and 57 deletions
|
|
@ -53,17 +53,17 @@ def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65
|
|||
positive_max = np_array[np.where(np_array > 0)].max()
|
||||
positive_min = np_array[np.where(np_array > 0)].min()
|
||||
if positive_max >= max_finite_val:
|
||||
logger.info(f"the float32 number {positive_max} will be truncated to {max_finite_val}")
|
||||
logger.debug(f"the float32 number {positive_max} will be truncated to {max_finite_val}")
|
||||
if positive_min <= min_positive_val:
|
||||
logger.info(f"the float32 number {positive_min} will be truncated to {min_positive_val}")
|
||||
logger.debug(f"the float32 number {positive_min} will be truncated to {min_positive_val}")
|
||||
|
||||
if np_array[np.where(np_array < 0)].shape[0] > 0:
|
||||
negative_max = np_array[np.where(np_array < 0)].max()
|
||||
negative_min = np_array[np.where(np_array < 0)].min()
|
||||
if negative_min <= -max_finite_val:
|
||||
logger.info(f"the float32 number {negative_min} will be truncated to {-max_finite_val}")
|
||||
logger.debug(f"the float32 number {negative_min} will be truncated to {-max_finite_val}")
|
||||
if negative_max >= -min_positive_val:
|
||||
logger.info(f"the float32 number {negative_max} will be truncated to {-min_positive_val}")
|
||||
logger.debug(f"the float32 number {negative_max} will be truncated to {-min_positive_val}")
|
||||
|
||||
np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
|
||||
np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
|
||||
|
|
|
|||
|
|
@ -70,27 +70,23 @@ cd onnxruntime/python/tools/transformers/models/stable_diffusion
|
|||
|
||||
## Example of Stable Diffusion 1.5
|
||||
|
||||
Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd-v1-5` instead of `./sd-v1-5`.
|
||||
Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd` instead of `./sd`.
|
||||
|
||||
### Setup Environment (CUDA)
|
||||
|
||||
It is recommended to create a Conda environment with Python 3.8, 3.9 or 3.10, and run the model with [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) or 11.8.
|
||||
```
|
||||
conda create -n py310 python=3.10
|
||||
conda activate py310
|
||||
pip install -r requirements.txt
|
||||
conda create -n py38 python=3.8
|
||||
conda activate py38
|
||||
pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
|
||||
pip install -r requirements-cuda.txt
|
||||
```
|
||||
|
||||
For Windows, the torch package installed from PyPI is CPU only. To enable support for GPU, it is necessary to install PyTorch version 1.13.1+cu117 or above using the following method:
|
||||
```
|
||||
pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
|
||||
```
|
||||
|
||||
ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions (like [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and cuDNN 8.5.0.96 in Windows).
|
||||
ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions.
|
||||
|
||||
#### Install Nightly (Optional)
|
||||
|
||||
Skip this step if you use onnxruntime-gpu 1.14.* release package.
|
||||
Skip this step if you use onnxruntime-gpu package from official releases.
|
||||
|
||||
To try latest optimizations, you can install [ort-nightly-gpu](https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/ORT-Nightly/PyPI/ort-nightly-gpu/) package like the following:
|
||||
|
||||
|
|
@ -101,64 +97,97 @@ pip install ort-nightly-gpu -i https://aiinfra.pkgs.visualstudio.com/PublicPacka
|
|||
|
||||
### Setup Environment (ROCm)
|
||||
|
||||
It is recommended that the users should run the model with ROCm 5.4 or newer and Python 3.9. Note that Windows is not
|
||||
supported for ROCm at the moment.
|
||||
It is recommended that the users run the model with ROCm 5.4 or newer and Python 3.8, 3.9 or 3.10.
|
||||
Note that Windows is not supported for ROCm at the moment.
|
||||
|
||||
```
|
||||
conda create -n py39 python=3.9
|
||||
conda activate py39
|
||||
pip install -r requirements.txt
|
||||
conda create -n py38 python=3.8
|
||||
conda activate py38
|
||||
wget https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/torch-1.12.1%2Brocm5.4-cp38-cp38-linux_x86_64.whl
|
||||
pip install torch-1.12.1+rocm5.4-cp38-cp38-linux_x86_64.whl
|
||||
pip install -r requirements-rocm.txt
|
||||
```
|
||||
|
||||
AMD GPU version of torch build can be installed from [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/),
|
||||
user need to download the whl file and `pip install <file.whl>` manually. Or directly from https://download.pytorch.org/whl/rocm5.4.2 via
|
||||
AMD GPU version of PyTorch can be installed from [pytorch.org](https://pytorch.org/get-started/locally/) or [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/).
|
||||
|
||||
#### Install onnxruntime-rocm
|
||||
|
||||
Here is an example to build onnxruntime from source with Rocm 5.4.2 in Ubuntu 20.04, and install the wheel.
|
||||
|
||||
(1) Install [ROCm 5.4.2](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.2/page/How_to_Install_ROCm.html). Note that the version is also used in PyTorch 2.0 ROCm package.
|
||||
|
||||
(2) Install some tools used in build:
|
||||
```
|
||||
pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/rocm5.4.2
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
zip \
|
||||
ca-certificates \
|
||||
build-essential \
|
||||
curl \
|
||||
libcurl4-openssl-dev \
|
||||
libssl-dev \
|
||||
python3-dev
|
||||
pip install numpy packaging "wheel>=0.35.1"
|
||||
wget --quiet https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz
|
||||
tar zxf cmake-3.26.3-linux-x86_64.tar.gz
|
||||
export PATH=${PWD}/cmake-3.26.3-linux-x86_64/bin:${PATH}
|
||||
```
|
||||
|
||||
Please follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build ONNXRuntime from source.
|
||||
(3) Build and Install ONNX Runtime
|
||||
```
|
||||
git clone https://github.com/microsoft/onnxruntime
|
||||
cd onnxruntime
|
||||
sh build.sh --config Release --use_rocm --rocm_home /opt/rocm --rocm_version 5.4.2 --build_wheel
|
||||
pip install build/Linux/Release/dist/*.whl
|
||||
```
|
||||
|
||||
You can also follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build with docker.
|
||||
|
||||
### Export ONNX pipeline
|
||||
This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers.
|
||||
|
||||
This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers. Before running the script, you need to be logged in via `huggingface-cli login`.
|
||||
It is recommended to use PyTorch 1.12.1 or 1.13.1 in this step. Using PyTorch 2.0 will encounter issue in exporting onnx.
|
||||
|
||||
```
|
||||
curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py
|
||||
python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd-v1-5
|
||||
python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd_v1_5/fp32
|
||||
```
|
||||
|
||||
### Optimize ONNX Pipeline
|
||||
|
||||
Example to optimize the exported float32 ONNX models, and save to float16 models:
|
||||
|
||||
```
|
||||
python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16
|
||||
python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd_v1_5/fp32 -o ./sd_v1_5/fp16 --float16
|
||||
```
|
||||
|
||||
If you installed ONNX Runtime v1.14, some optimizations (packed QKV and BiasAdd) will be disabled automatically since they are not available in v1.14.
|
||||
|
||||
For Stable Diffusion 2.1 model with CUDA EP, you will need force Attention to run in float32 to avoid black image by appending `--force_fp32_ops unet:Attention` to the command line.
|
||||
If you are using nightly package, append `--force_fp32_ops unet:MultiHeadAttention` instead.
|
||||
|
||||
### Run Benchmark
|
||||
|
||||
The benchmark.py script will run a warm-up prompt twice, and measure the peak GPU memory usage in these two runs, then record them as first_run_memory_MB and second_run_memory_MB. Then it will run 5 runs to get average latency (in seconds), and output the results to benchmark_result.csv.
|
||||
|
||||
Note that the first run might need more time and memory: For example, cuDNN convolution algorithm search or model compile happens in the first run.
|
||||
|
||||
Example to benchmark the optimized pipeline with batch size 1 on CUDA EP:
|
||||
To avoid black image output for Stable Diffusion 2.1 with CUDA EP, we can set an environment variable before inferencing:
|
||||
```
|
||||
python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1
|
||||
export ORT_DISABLE_TRT_FLASH_ATTENTION=1
|
||||
```
|
||||
|
||||
Before running benchmark on PyTorch, you need to be logged in via `huggingface-cli login` once.
|
||||
|
||||
Example to benchmark the optimized pipeline of stable diffusion 1.5 with batch size 1 on CUDA EP:
|
||||
```
|
||||
python benchmark.py -p ./sd_v1_5/fp16 -b 1 -v 1.5
|
||||
```
|
||||
|
||||
On ROCm EP, use the following command instead:
|
||||
```
|
||||
python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1 --tuning --provider=rocm
|
||||
python benchmark.py -p ./sd_v1_5/fp16 -b 1 --tuning --provider rocm -v 1.5
|
||||
```
|
||||
|
||||
Note: you can substitute `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` with `python benchmark.py` if your current working directory is this files directory.
|
||||
In the following, we will use it interchangeably.
|
||||
For ROCm EP, you can substitute `python benchmark.py` with `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` since
|
||||
the installed package is built from source. For CUDA, it is recommended to run `python benchmark.py` with the latest benchmark script.
|
||||
|
||||
For ROCm EP, the `--tuning` is mandatory because we heavily rely on tuning to find the runable kernels for ORT `OpKernel`s.
|
||||
|
||||
|
|
@ -169,20 +198,22 @@ The default parameters are stable diffusion version=1.5, height=512, width=512,
|
|||
Run PyTorch 1.13.1+cu117 with xFormers like the following
|
||||
|
||||
```
|
||||
python benchmark.py -e torch -b 1 --use_xformers
|
||||
pip install xformers==0.0.16
|
||||
python benchmark.py -e torch -b 1 --use_xformers -v 1.5
|
||||
```
|
||||
|
||||
### Run Benchmark with PyTorch 2.0 with torch.compile
|
||||
|
||||
Let's create a new environment to run PyTorch 2.0:
|
||||
|
||||
For CUDA:
|
||||
```
|
||||
conda create -n pt2 python=3.10
|
||||
conda activate pt2
|
||||
pip install torch --index-url https://download.pytorch.org/whl/cu117
|
||||
pip install -r requirements.txt
|
||||
pip install -r requirements_cuda.txt # or requirements_rocm.txt
|
||||
python benchmark.py -e torch -b 1 --enable_torch_compile
|
||||
pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu117
|
||||
python benchmark.py -e torch -b 1 --enable_torch_compile -v 1.5
|
||||
```
|
||||
|
||||
For ROCm:
|
||||
```
|
||||
pip install torch --upgrade --index-url https://download.pytorch.org/whl/rocm5.4.2
|
||||
python benchmark.py -e torch -b 1 --enable_torch_compile --provider rocm -v 1.5
|
||||
```
|
||||
|
||||
Sometime, it complains ptxas not found when there are multiple CUDA versions installed. It can be fixed like `export TRITON_PTXAS_PATH=/usr/local/cuda-11.7/bin/ptxas` before running benchmark.
|
||||
|
|
@ -273,15 +304,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine:
|
|||
|
||||
| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB |
|
||||
| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 1 | 2.2 | 5,548 | 4,908 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 1 | 2.2 | 5,548 | 4,908 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 1 | 3.4 | 6,653 | 4,613 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 1 | 3.2 | 5,977 | 4,368 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 1 | 3.0 | 5,869 | 4,266 |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 4 | 6.6 | 5,546 | 4,906 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 4 | 6.6 | 5,546 | 4,906 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 4 | 10.1 | 19,477 | 11,325 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 4 | 10.5 | 13,051 | 7,300 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 4 | 9.2 | 12,879 | 7,190 |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 8 | 12.5 | 9,778 | 9,006 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 8 | 12.5 | 9,778 | 9,006 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 8 | 19.3 | 55,851 | 20,014 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 8 | 20.3 | 23,551 | 11,930 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 8 | 17.8 | 23,303 | 11,800 |
|
||||
|
|
@ -290,15 +321,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine:
|
|||
|
||||
| engine | version | provider | batch size | average latency | first run memory MB | second run memory MB |
|
||||
| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 1 | 2.4 | 5,254 | 4,614 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 1 | 2.4 | 5,254 | 4,614 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 1 | 3.5 | 5,771 | 4,672 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 1 | 3.5 | 5,811 | 4,206 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 1 | 3.1 | 5,774 | 4,168 |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 4 | 7.5 | 7,290 | 6,646 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 4 | 7.5 | 7,290 | 6,646 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 4 | 10.7 | 19,334 | 11,181 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 4 | 11.5 | 12,881 | 7,151 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 4 | 10.0 | 12,740 | 7,073 |
|
||||
| onnxruntime | dev | ROCMExecutionProvider | 8 | 14.4 | 7,320 | 6,676 |
|
||||
| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 8 | 14.4 | 7,320 | 6,676 |
|
||||
| torch | 1.12.1+rocm5.4 | - | 8 | 20.2 | 31,820 | 19,908 |
|
||||
| torch | 2.0.0+rocm5.4.2 | default | 8 | 22.2 | 23,415 | 11,815 |
|
||||
| torch | 2.0.0+rocm5.4.2 | compile | 8 | 19.3 | 23,154 | 11,667 |
|
||||
|
|
|
|||
|
|
@ -641,6 +641,11 @@ def main():
|
|||
if args.engine == "onnxruntime":
|
||||
assert args.pipeline, "--pipeline should be specified for onnxruntime engine"
|
||||
|
||||
if args.version in ["2.1"]:
|
||||
# Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model
|
||||
# This shall be done before the first inference run.
|
||||
os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1"
|
||||
|
||||
result = run_ort(
|
||||
sd_model,
|
||||
args.pipeline,
|
||||
|
|
|
|||
|
|
@ -131,9 +131,9 @@ def optimize_sd_pipeline(
|
|||
if model_type in ["unet"]:
|
||||
# Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd
|
||||
has_all_optimizations = version.parse(onnxruntime.__version__) >= version.parse("1.15.0")
|
||||
fusion_options.enable_packed_kv = float16
|
||||
fusion_options.enable_packed_qkv = float16 and has_all_optimizations
|
||||
fusion_options.enable_bias_add = has_all_optimizations
|
||||
fusion_options.enable_packed_kv = float16 and fusion_options.enable_packed_kv
|
||||
fusion_options.enable_packed_qkv = float16 and has_all_optimizations and fusion_options.enable_packed_qkv
|
||||
fusion_options.enable_bias_add = has_all_optimizations and fusion_options.enable_bias_add
|
||||
|
||||
m = optimize_model(
|
||||
str(onnx_model_path),
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Install the following package in python 3.10
|
||||
-r requirements.txt
|
||||
onnxruntime-gpu>=1.14
|
||||
py3nvml==0.2.7
|
||||
xformers==0.0.16
|
||||
#For Windows, need install PyTorch 1.13.1+cu117 since torch in pypi is CPU version
|
||||
#To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117
|
||||
#--extra-index-url https://download.pytorch.org/whl/cu117
|
||||
#torch==1.13.1+cu117
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
-r requirements.txt
|
||||
# Install onnxruntime-rocm or onnxruntime_training
|
||||
# Build onnxruntime-rocm from source
|
||||
# Directly install pre-built onnxruntime/onnxruntime-training rocm python package is not possible at the moment.
|
||||
|
|
|
|||
Loading…
Reference in a new issue