From e0c1fa35a87657e3e7945fbc7422d69226531fe8 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 9 May 2023 15:29:13 -0700 Subject: [PATCH] update stable diffusion script and doc (#15846) ### Description Update script: (1) change some float16 verbose logging to debug level. (2) Let requirements-cuda.txt includes requirements.txt (3) Use an environment variable ORT_DISABLE_TRT_FLASH_ATTENTION=1 to avoid black image in 2.1 model. Update benchmark and doc. (4) Update document to include command lines to build ORT rocm from source. (5) Update optimize_pipeline.py so that user can disable packed qkv/kv from command line options. (6) Update document to use torch < 2.0 for onnx export. ### Motivation and Context --- .../python/tools/transformers/float16.py | 8 +- .../models/stable_diffusion/README.md | 125 +++++++++++------- .../models/stable_diffusion/benchmark.py | 5 + .../stable_diffusion/optimize_pipeline.py | 6 +- .../stable_diffusion/requirements-cuda.txt | 5 +- .../stable_diffusion/requirements-rocm.txt | 1 + 6 files changed, 93 insertions(+), 57 deletions(-) diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index 790d9d45c0..c2a7a055e9 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -53,17 +53,17 @@ def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65 positive_max = np_array[np.where(np_array > 0)].max() positive_min = np_array[np.where(np_array > 0)].min() if positive_max >= max_finite_val: - logger.info(f"the float32 number {positive_max} will be truncated to {max_finite_val}") + logger.debug(f"the float32 number {positive_max} will be truncated to {max_finite_val}") if positive_min <= min_positive_val: - logger.info(f"the float32 number {positive_min} will be truncated to {min_positive_val}") + logger.debug(f"the float32 number {positive_min} will be truncated to {min_positive_val}") if 
np_array[np.where(np_array < 0)].shape[0] > 0: negative_max = np_array[np.where(np_array < 0)].max() negative_min = np_array[np.where(np_array < 0)].min() if negative_min <= -max_finite_val: - logger.info(f"the float32 number {negative_min} will be truncated to {-max_finite_val}") + logger.debug(f"the float32 number {negative_min} will be truncated to {-max_finite_val}") if negative_max >= -min_positive_val: - logger.info(f"the float32 number {negative_max} will be truncated to {-min_positive_val}") + logger.debug(f"the float32 number {negative_max} will be truncated to {-min_positive_val}") np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array) np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index 980a867175..e47bcc62d9 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -70,27 +70,23 @@ cd onnxruntime/python/tools/transformers/models/stable_diffusion ## Example of Stable Diffusion 1.5 -Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd-v1-5` instead of `./sd-v1-5`. +Below is an example to optimize Stable Diffusion 1.5 in Linux. For Windows OS, please change the format of path to be like `.\sd` instead of `./sd`. ### Setup Environment (CUDA) +It is recommended to create a Conda environment with Python 3.8, 3.9 or 3.10, and run the model with [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) or 11.8. 
``` -conda create -n py310 python=3.10 -conda activate py310 -pip install -r requirements.txt +conda create -n py38 python=3.8 +conda activate py38 +pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 pip install -r requirements-cuda.txt ``` -For Windows, the torch package installed from PyPI is CPU only. To enable support for GPU, it is necessary to install PyTorch version 1.13.1+cu117 or above using the following method: -``` -pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 -``` - -ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions (like [CUDA 11.7](https://developer.nvidia.com/cuda-11-7-0-download-archive) and cuDNN 8.5.0.96 in Windows). +ONNX Runtime requires CUDA and [cuDNN](https://developer.nvidia.com/rdp/cudnn-download) for GPU inference. See https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html for compatible versions. #### Install Nightly (Optional) -Skip this step if you use onnxruntime-gpu 1.14.* release package. +Skip this step if you use onnxruntime-gpu package from official releases. To try latest optimizations, you can install [ort-nightly-gpu](https://aiinfra.visualstudio.com/PublicPackages/_artifacts/feed/ORT-Nightly/PyPI/ort-nightly-gpu/) package like the following: @@ -101,64 +97,97 @@ pip install ort-nightly-gpu -i https://aiinfra.pkgs.visualstudio.com/PublicPacka ### Setup Environment (ROCm) -It is recommended that the users should run the model with ROCm 5.4 or newer and Python 3.9. Note that Windows is not -supported for ROCm at the moment. +It is recommended that the users run the model with ROCm 5.4 or newer and Python 3.8, 3.9 or 3.10. +Note that Windows is not supported for ROCm at the moment. 
``` -conda create -n py39 python=3.9 -conda activate py39 -pip install -r requirements.txt +conda create -n py38 python=3.8 +conda activate py38 +wget https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/torch-1.12.1%2Brocm5.4-cp38-cp38-linux_x86_64.whl +pip install torch-1.12.1+rocm5.4-cp38-cp38-linux_x86_64.whl pip install -r requirements-rocm.txt ``` -AMD GPU version of torch build can be installed from [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/), -user need to download the whl file and `pip install ` manually. Or directly from https://download.pytorch.org/whl/rocm5.4.2 via +AMD GPU version of PyTorch can be installed from [pytorch.org](https://pytorch.org/get-started/locally/) or [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/). + +#### Install onnxruntime-rocm + +Here is an example to build onnxruntime from source with ROCm 5.4.2 on Ubuntu 20.04, and install the wheel. + +(1) Install [ROCm 5.4.2](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.2/page/How_to_Install_ROCm.html). Note that the version is also used in PyTorch 2.0 ROCm package. + +(2) Install some tools used in build: ``` -pip install torch==2.0.0 --index-url https://download.pytorch.org/whl/rocm5.4.2 +sudo apt-get update +sudo apt-get install -y --no-install-recommends \ + wget \ + zip \ + ca-certificates \ + build-essential \ + curl \ + libcurl4-openssl-dev \ + libssl-dev \ + python3-dev +pip install numpy packaging "wheel>=0.35.1" +wget --quiet https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz +tar zxf cmake-3.26.3-linux-x86_64.tar.gz +export PATH=${PWD}/cmake-3.26.3-linux-x86_64/bin:${PATH} ``` -Please follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build ONNXRuntime from source. 
+(3) Build and Install ONNX Runtime +``` +git clone https://github.com/microsoft/onnxruntime +cd onnxruntime +sh build.sh --config Release --use_rocm --rocm_home /opt/rocm --rocm_version 5.4.2 --build_wheel +pip install build/Linux/Release/dist/*.whl +``` + +You can also follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build with docker. ### Export ONNX pipeline +This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers. -This step will export stable diffusion 1.5 to ONNX model in float32 using script from diffusers. Before running the script, you need to be logged in via `huggingface-cli login`. +It is recommended to use PyTorch 1.12.1 or 1.13.1 in this step. Using PyTorch 2.0 will run into issues when exporting to ONNX. ``` curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py -python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd-v1-5 +python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd_v1_5/fp32 ``` ### Optimize ONNX Pipeline Example to optimize the exported float32 ONNX models, and save to float16 models: - ``` -python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd-v1-5 -o ./sd-v1-5-fp16 --float16 +python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i ./sd_v1_5/fp32 -o ./sd_v1_5/fp16 --float16 ``` If you installed ONNX Runtime v1.14, some optimizations (packed QKV and BiasAdd) will be disabled automatically since they are not available in v1.14. -For Stable Diffusion 2.1 model with CUDA EP, you will need force Attention to run in float32 to avoid black image by appending `--force_fp32_ops unet:Attention` to the command line. -If you are using nightly package, append `--force_fp32_ops unet:MultiHeadAttention` instead. 
- ### Run Benchmark The benchmark.py script will run a warm-up prompt twice, and measure the peak GPU memory usage in these two runs, then record them as first_run_memory_MB and second_run_memory_MB. Then it will run 5 runs to get average latency (in seconds), and output the results to benchmark_result.csv. Note that the first run might need more time and memory: For example, cuDNN convolution algorithm search or model compile happens in the first run. -Example to benchmark the optimized pipeline with batch size 1 on CUDA EP: +To avoid black image output for Stable Diffusion 2.1 with CUDA EP, we can set an environment variable before inferencing: ``` -python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1 +export ORT_DISABLE_TRT_FLASH_ATTENTION=1 +``` + +Before running benchmark on PyTorch, you need to be logged in via `huggingface-cli login` once. + +Example to benchmark the optimized pipeline of stable diffusion 1.5 with batch size 1 on CUDA EP: +``` +python benchmark.py -p ./sd_v1_5/fp16 -b 1 -v 1.5 ``` On ROCm EP, use the following command instead: ``` -python -m onnxruntime.transformers.models.stable_diffusion.benchmark -p ./sd-v1-5-fp16/ -b 1 --tuning --provider=rocm +python benchmark.py -p ./sd_v1_5/fp16 -b 1 --tuning --provider rocm -v 1.5 ``` -Note: you can substitute `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` with `python benchmark.py` if your current working directory is this files directory. -In the following, we will use it interchangeably. +For ROCm EP, you can substitute `python benchmark.py` with `python -m onnxruntime.transformers.models.stable_diffusion.benchmark` since +the installed package is built from source. For CUDA, it is recommended to run `python benchmark.py` with the latest benchmark script. For ROCm EP, the `--tuning` is mandatory because we heavily rely on tuning to find the runable kernels for ORT `OpKernel`s. 
@@ -169,20 +198,22 @@ The default parameters are stable diffusion version=1.5, height=512, width=512, Run PyTorch 1.13.1+cu117 with xFormers like the following ``` -python benchmark.py -e torch -b 1 --use_xformers +pip install xformers==0.0.16 +python benchmark.py -e torch -b 1 --use_xformers -v 1.5 ``` ### Run Benchmark with PyTorch 2.0 with torch.compile -Let's create a new environment to run PyTorch 2.0: - +For CUDA: ``` -conda create -n pt2 python=3.10 -conda activate pt2 -pip install torch --index-url https://download.pytorch.org/whl/cu117 -pip install -r requirements.txt -pip install -r requirements_cuda.txt # or requirements_rocm.txt -python benchmark.py -e torch -b 1 --enable_torch_compile +pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu117 +python benchmark.py -e torch -b 1 --enable_torch_compile -v 1.5 +``` + +For ROCm: +``` +pip install torch --upgrade --index-url https://download.pytorch.org/whl/rocm5.4.2 +python benchmark.py -e torch -b 1 --enable_torch_compile --provider rocm -v 1.5 ``` Sometime, it complains ptxas not found when there are multiple CUDA versions installed. It can be fixed like `export TRITON_PTXAS_PATH=/usr/local/cuda-11.7/bin/ptxas` before running benchmark. 
@@ -273,15 +304,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine: | engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | | ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | dev | ROCMExecutionProvider | 1 | 2.2 | 5,548 | 4,908 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 1 | 2.2 | 5,548 | 4,908 | | torch | 1.12.1+rocm5.4 | - | 1 | 3.4 | 6,653 | 4,613 | | torch | 2.0.0+rocm5.4.2 | default | 1 | 3.2 | 5,977 | 4,368 | | torch | 2.0.0+rocm5.4.2 | compile | 1 | 3.0 | 5,869 | 4,266 | -| onnxruntime | dev | ROCMExecutionProvider | 4 | 6.6 | 5,546 | 4,906 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 4 | 6.6 | 5,546 | 4,906 | | torch | 1.12.1+rocm5.4 | - | 4 | 10.1 | 19,477 | 11,325 | | torch | 2.0.0+rocm5.4.2 | default | 4 | 10.5 | 13,051 | 7,300 | | torch | 2.0.0+rocm5.4.2 | compile | 4 | 9.2 | 12,879 | 7,190 | -| onnxruntime | dev | ROCMExecutionProvider | 8 | 12.5 | 9,778 | 9,006 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 8 | 12.5 | 9,778 | 9,006 | | torch | 1.12.1+rocm5.4 | - | 8 | 19.3 | 55,851 | 20,014 | | torch | 2.0.0+rocm5.4.2 | default | 8 | 20.3 | 23,551 | 11,930 | | torch | 2.0.0+rocm5.4.2 | compile | 8 | 17.8 | 23,303 | 11,800 | @@ -290,15 +321,15 @@ Results are from Standard_NC4as_T4_v3 Azure virtual machine: | engine | version | provider | batch size | average latency | first run memory MB | second run memory MB | | ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- | -| onnxruntime | dev | ROCMExecutionProvider | 1 | 2.4 | 5,254 | 4,614 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 1 | 2.4 | 5,254 | 4,614 | | torch | 1.12.1+rocm5.4 | - | 1 | 3.5 | 5,771 | 4,672 | | torch | 2.0.0+rocm5.4.2 | default | 1 | 3.5 | 5,811 | 4,206 | | 
torch | 2.0.0+rocm5.4.2 | compile | 1 | 3.1 | 5,774 | 4,168 | -| onnxruntime | dev | ROCMExecutionProvider | 4 | 7.5 | 7,290 | 6,646 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 4 | 7.5 | 7,290 | 6,646 | | torch | 1.12.1+rocm5.4 | - | 4 | 10.7 | 19,334 | 11,181 | | torch | 2.0.0+rocm5.4.2 | default | 4 | 11.5 | 12,881 | 7,151 | | torch | 2.0.0+rocm5.4.2 | compile | 4 | 10.0 | 12,740 | 7,073 | -| onnxruntime | dev | ROCMExecutionProvider | 8 | 14.4 | 7,320 | 6,676 | +| onnxruntime | 1.15.0+rocm5.4.2 | ROCMExecutionProvider | 8 | 14.4 | 7,320 | 6,676 | | torch | 1.12.1+rocm5.4 | - | 8 | 20.2 | 31,820 | 19,908 | | torch | 2.0.0+rocm5.4.2 | default | 8 | 22.2 | 23,415 | 11,815 | | torch | 2.0.0+rocm5.4.2 | compile | 8 | 19.3 | 23,154 | 11,667 | diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index 1226c3bfab..4e00ded9e3 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -641,6 +641,11 @@ def main(): if args.engine == "onnxruntime": assert args.pipeline, "--pipeline should be specified for onnxruntime engine" + if args.version in ["2.1"]: + # Set a flag to avoid overflow in attention, which causes black image output in SD 2.1 model + # This shall be done before the first inference run. 
+ os.environ["ORT_DISABLE_TRT_FLASH_ATTENTION"] = "1" + result = run_ort( sd_model, args.pipeline, diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index c74418f58a..5e8a25eb0b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -131,9 +131,9 @@ def optimize_sd_pipeline( if model_type in ["unet"]: # Some optimizations are not available in v1.14 or older version: packed QKV and BiasAdd has_all_optimizations = version.parse(onnxruntime.__version__) >= version.parse("1.15.0") - fusion_options.enable_packed_kv = float16 - fusion_options.enable_packed_qkv = float16 and has_all_optimizations - fusion_options.enable_bias_add = has_all_optimizations + fusion_options.enable_packed_kv = float16 and fusion_options.enable_packed_kv + fusion_options.enable_packed_qkv = float16 and has_all_optimizations and fusion_options.enable_packed_qkv + fusion_options.enable_bias_add = has_all_optimizations and fusion_options.enable_bias_add m = optimize_model( str(onnx_model_path), diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt index d96199fc2f..18852f515a 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda.txt @@ -1,7 +1,6 @@ -# Install the following package in python 3.10 +-r requirements.txt onnxruntime-gpu>=1.14 py3nvml==0.2.7 -xformers==0.0.16 -#For Windows, need install PyTorch 1.13.1+cu117 since torch in pypi is CPU version +#To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117 #--extra-index-url https://download.pytorch.org/whl/cu117 
#torch==1.13.1+cu117 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt index ada3db3020..c0a925e25b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt @@ -1,3 +1,4 @@ +-r requirements.txt # Install onnxruntime-rocm or onnxruntime_training # Build onnxruntime-rocm from source # Directly install pre-built onnxruntime/onnxruntime-training rocm python package is not possible at the moment.