From 42d62b8f2b69f9cdfd8a650a9e330b2c547fe20c Mon Sep 17 00:00:00 2001
From: Ted Themistokleous
 <107195283+TedThemistokleous@users.noreply.github.com>
Date: Sat, 6 May 2023 05:35:21 -0400
Subject: [PATCH] Fixes to get stable diffusion benchmark running (#15755)

### Description

Added changes to the MIGraphX EP to support stable diffusion

1. Treat inputs with parameterized dimensions as having no fixed input
shape, so that they do not trigger a precompile to set input parameters
in the EP
2. Removed input checking for Resize operator in EP as MIGraphX already
performs these checks
3. Add support to benchmark script to use the MIGraphX execution
provider
4. Add support for an odd valued batch size (3) that was seen on other
benchmarks we were performing comparison on.

### Motivation and Context

These changes are required to get stable diffusion models to run on
MIGraphX through the EP. Without these changes we see the following
incorrect behavior.

1. Resize operators are pushed onto the CPU EP instead of MIGraphX,
causing a significant slowdown during runs
2. Precompile operations incorrectly parse the input_ids parameter for
our text model, with a 1, which breaks the MIGraphX compile of the ONNX
model. This in turn throws an error and stops any setup before inference.
3. Selecting the correct EP in the benchmark script which was previously
missing the MIGraphX option
4. Suppressed an error we keep seeing with pthread_setaffinity_np - this
is a quality of life change when using the MIGraphX EP

This was tested with the benchmark.py script using stable diffusion v2,
located in

onnxruntime/onnxruntime/python/tools/transformers/models/stable_diffusion/

---------

Co-authored-by: Ted Themistokleous <tthemist@amd.com>
---
 onnxruntime/core/platform/posix/env.cc        |  2 ++
 .../migraphx/migraphx_execution_provider.cc   | 22 +++++++++----------
 .../models/stable_diffusion/benchmark.py      |  3 ++-
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index a5aa5773a3..14217a126d 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -248,11 +248,13 @@ class PosixThread : public EnvThread {
                                 << ", mask: " << *p->affinity;
         } else {
           auto [err_no, err_msg] = GetSystemError(ret);
+#if !defined(USE_MIGRAPHX)
           LOGS_DEFAULT(ERROR) << "pthread_setaffinity_np failed for thread: " << syscall(SYS_gettid)
                               << ", index: " << p->index
                               << ", mask: " << *p->affinity
                               << ", error code: " << err_no << " error msg: " << err_msg
                               << ". Specify the number of threads explicitly so the affinity is not set.";
+#endif
         }
       }
 #endif
diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index ba98a79a72..7cf8e4759f 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -451,15 +451,6 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co
       }
     }
 
-    const auto& args = node->InputDefs();
-    if (args.size() > 1) {
-      std::vector<std::size_t> indices(args.size() - 1);
-      std::iota(indices.begin(), indices.end(), 1);
-      if (canEvalNodeArgument(graph_viewer, node, indices, input_nodes)) {
-        return false;
-      }
-      return true;
-    }
   } else if (optype == "ReduceSum") {
     const auto& args = node->InputDefs();
     if (args.size() == 2) {
@@ -952,8 +943,15 @@ bool get_input_output_names(const GraphViewer& graph,
     if (sptr == nullptr)
       return true;
 
-    auto dim_size = sptr->dim_size();
-    return (dim_size == 0);
+    if (sptr->dim_size() == 0)
+      return true;
+
+    for (int i = 0; i < sptr->dim_size(); i++) {
+      if (sptr->dim(i).has_dim_param())
+        return true;
+    }
+
+    return false;
   });
 
   const auto& out_args = graph.GetOutputs();
@@ -1002,7 +1000,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
     }
 
     std::vector<std::string> input_names, output_names;
-    no_input_shape = no_input_shape or get_input_output_names(graph_body_viewer, input_names, output_names);
+    no_input_shape = get_input_output_names(graph_body_viewer, input_names, output_names);
 
     // by parsing the model_proto, create a program corresponding to
     // the input fused_node
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
index e6178f3c5d..1226c3bfab 100755
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@@ -19,6 +19,7 @@ SD_MODELS = {
 PROVIDERS = {
     "cuda": "CUDAExecutionProvider",
     "rocm": "ROCMExecutionProvider",
+    "migraphx": "MIGraphXExecutionProvider",
 }
 
 
@@ -570,7 +571,7 @@ def parse_arguments():
         "--batch_size",
         type=int,
         default=1,
-        choices=[1, 2, 4, 8, 10, 16, 32],
+        choices=[1, 2, 3, 4, 8, 10, 16, 32],
         help="Number of images per batch. Default is 1.",
     )