From 42d62b8f2b69f9cdfd8a650a9e330b2c547fe20c Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Sat, 6 May 2023 05:35:21 -0400 Subject: [PATCH] Fixes to get stable diffusion benchmark running (#15755) ### Description Added changes to MIGraphX EP to support stable diffusion 1. Added parameterized input dimensions to not trigger a precompile to set input parameters in the EP 2. Removed input checking for Resize operator in EP as MIGraphX already performs these checks 3. Add support to benchmark script to use the MIGraphX execution provider 4. Add support for an odd valued batch size (3) that was seen on other benchmarks we were performing comparison on. ### Motivation and Context These changes are required to get stable diffusion models to run on MIGraphX through the EP. Without these changes we see the following incorrect behavior. 1. Resize operators are pushed onto the CPU EP instead of MIGraphX, causing a significant slowdown during runs 2. Precompile operations incorrectly parse input_ids parameter for our text model, with a 1, which breaks during MIGraphX Compile of onnx. This in turn throws an error and stops any setup before inference. 3. Selecting the correct EP in the benchmark script which was previously missing the MIGraphX option 4. 
Suppressed an error we keep seeing with pthread_set_affinity - this is a quality of life change when using the MIGraphX EP This was testing with the benchmark.py script using stable diffusion v2 located in onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion/ --------- Co-authored-by: Ted Themistokleous --- onnxruntime/core/platform/posix/env.cc | 2 ++ .../migraphx/migraphx_execution_provider.cc | 22 +++++++++---------- .../models/stable_diffusion/benchmark.py | 3 ++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index a5aa5773a3..14217a126d 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -248,11 +248,13 @@ class PosixThread : public EnvThread { << ", mask: " << *p->affinity; } else { auto [err_no, err_msg] = GetSystemError(ret); +#if !defined(USE_MIGRAPHX) LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid) << ", index: " << p->index << ", mask: " << *p->affinity << ", error code: " << err_no << " error msg: " << err_msg << ". 
Specify the number of threads explicitly so the affinity is not set."; +#endif } } #endif diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index ba98a79a72..7cf8e4759f 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -451,15 +451,6 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co } } - const auto& args = node->InputDefs(); - if (args.size() > 1) { - std::vector indices(args.size() - 1); - std::iota(indices.begin(), indices.end(), 1); - if (canEvalNodeArgument(graph_viewer, node, indices, input_nodes)) { - return false; - } - return true; - } } else if (optype == "ReduceSum") { const auto& args = node->InputDefs(); if (args.size() == 2) { @@ -952,8 +943,15 @@ bool get_input_output_names(const GraphViewer& graph, if (sptr == nullptr) return true; - auto dim_size = sptr->dim_size(); - return (dim_size == 0); + if (sptr->dim_size() == 0) + return true; + + for (int i = 0; i < sptr->dim_size(); i++) { + if (sptr->dim(i).has_dim_param()) + return true; + } + + return false; }); const auto& out_args = graph.GetOutputs(); @@ -1002,7 +1000,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& } std::vector input_names, output_names; - no_input_shape = no_input_shape or get_input_output_names(graph_body_viewer, input_names, output_names); + no_input_shape = get_input_output_names(graph_body_viewer, input_names, output_names); // by parsing the model_proto, create a program corresponding to // the input fused_node diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index e6178f3c5d..1226c3bfab 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ 
b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -19,6 +19,7 @@ SD_MODELS = { PROVIDERS = { "cuda": "CUDAExecutionProvider", "rocm": "ROCMExecutionProvider", + "migraphx": "MIGraphXExecutionProvider", } @@ -570,7 +571,7 @@ def parse_arguments(): "--batch_size", type=int, default=1, - choices=[1, 2, 4, 8, 10, 16, 32], + choices=[1, 2, 3, 4, 8, 10, 16, 32], help="Number of images per batch. Default is 1.", )