From a9dc50ba8b324e4f653eff2b172421d8b2e1b3c3 Mon Sep 17 00:00:00 2001
From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com>
Date: Wed, 2 Mar 2022 23:26:14 -0800
Subject: [PATCH] Add option to force QDQIsInt8Allowed to return true when
 exporting to ORT format (#10719)

* wip

* save

* minor update

* fix

* fix

* Revert "fix"

This reverts commit a76f364b2d9b4b0967d8d8c852d5cc05f02e5360.

* revert

* revert

* revert submodule removal

* address pr comments

* minor fix

* address cr comments

* fix format

Co-authored-by: rachguo <rachguo@rachguos-Mini.attlocal.net>
---
 .../onnxruntime_session_options_config_keys.h  |  4 ++++
 .../core/optimizer/graph_transformer_utils.cc  | 10 ++++++++--
 .../qdq_selector_action_transformer.cc         |  4 ++--
 .../qdq_selector_action_transformer.h          |  4 ++--
 .../python/util/convert_onnx_models_to_ort.py  | 18 +++++++++++++++---
 5 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 82fe74cf76..457d20f9a1 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -78,6 +78,10 @@ static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session
 // has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
 static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";
 
+// This should only be specified when exporting an ORT format model for use on a different platform.
+// If the ORT format model will be used on ARM platforms, set to "1". For other platforms, set to "0".
+static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";
+
 // Save information for replaying graph optimizations later instead of applying them directly.
 //
 // When an ONNX model is loaded, ORT can perform various optimizations on the graph.
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 93146ad459..c9bfddb56f 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -7,6 +7,7 @@
 
 #include "core/optimizer/conv_activation_fusion.h"
 #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
+#include "core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 
 #if !defined(ORT_MINIMAL_BUILD)
@@ -161,6 +162,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1";
   const bool enable_quant_qdq_cleanup =
       session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQCleanup, "0") == "1";
+  const bool qdq_is_int8_allowed =
+      session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQIsInt8Allowed, QDQIsInt8Allowed() ? "1" : "0") == "1";
 #ifndef DISABLE_CONTRIB_OPS
   const bool enable_gelu_approximation =
       session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableGeluApproximation, "0") == "1";
@@ -208,10 +211,13 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
                                                                             onnxruntime::kArmNNExecutionProvider};
 
       if (!disable_quant_qdq) {
-        if (!QDQIsInt8Allowed()) {
+        // QDQS8ToU8Transformer is not currently supported in a minimal build. If it were, it would need to run in
+        // Level 1 during export rather than Level 2 at runtime, since running it at runtime would produce overlapping
+        // optimizations (unsupported by runtime optimization). A session config can force qdqisint8allowed to true.
+        if (!qdq_is_int8_allowed) {
           transformers.emplace_back(std::make_unique<QDQS8ToU8Transformer>(cpu_ep));
         }
-        transformers.emplace_back(std::make_unique<QDQSelectorActionTransformer>());
+        transformers.emplace_back(std::make_unique<QDQSelectorActionTransformer>(SatApplyContextVariant{}, qdq_is_int8_allowed));
       }
 
       transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
index e137ba3f94..694c2cab6b 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
@@ -208,10 +208,10 @@ SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) {
 }  // namespace
 
 QDQSelectorActionTransformer::QDQSelectorActionTransformer(
-    const SatApplyContextVariant& apply_context)
+    const SatApplyContextVariant& apply_context, bool is_int8_allowed)
     : SelectorActionTransformer{
           "QDQSelectorActionTransformer",
-          CreateSelectorActionRegistry(QDQIsInt8Allowed()),
+          CreateSelectorActionRegistry(is_int8_allowed),
           apply_context,
           // this transformer is only compatible with the CPU EP
           {kCpuExecutionProvider}} {
diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h
index d8b265d420..2c48109de1 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h
@@ -18,11 +18,15 @@ inline constexpr bool QDQIsInt8Allowed(){
 
 
 /**
-Transformer that fuses QDQ and fp32 ops into quantized ops. 
+Transformer that fuses QDQ and fp32 ops into quantized ops.
+@param apply_context Forwarded unchanged to the SelectorActionTransformer base.
+@param is_int8_allowed Passed to CreateSelectorActionRegistry when building the selector/action registry.
+       Defaults to QDQIsInt8Allowed() so callers that omit it keep the pre-existing platform-dependent behavior.
 */
 class QDQSelectorActionTransformer : public SelectorActionTransformer {
  public:
-  QDQSelectorActionTransformer(const SatApplyContextVariant& apply_context = {});
+  QDQSelectorActionTransformer(const SatApplyContextVariant& apply_context = {},
+                               bool is_int8_allowed = QDQIsInt8Allowed());
 };
 
 }  // namespace onnxruntime
diff --git a/tools/python/util/convert_onnx_models_to_ort.py b/tools/python/util/convert_onnx_models_to_ort.py
index e057857054..48c7d6552b 100644
--- a/tools/python/util/convert_onnx_models_to_ort.py
+++ b/tools/python/util/convert_onnx_models_to_ort.py
@@ -62,7 +62,7 @@ def _create_session_options(optimization_level: ort.GraphOptimizationLevel,
 
 def _convert(model_path_or_dir: pathlib.Path, optimization_level_str: str, use_nnapi: bool, use_coreml: bool,
              custom_op_library: pathlib.Path, create_optimized_onnx_model: bool, allow_conversion_failures: bool,
-             session_options_config_entries: typing.Dict[str, str]):
+             target_platform: str, session_options_config_entries: typing.Dict[str, str]):
 
     optimization_level = get_optimization_level(optimization_level_str)
 
@@ -91,7 +91,7 @@ def _convert(model_path_or_dir: pathlib.Path, optimization_level_str: str, use_n
     # If someone really really really wants to run it they could manually create an optimized onnx model first,
     # or they could comment out this code.
     optimizer_filter = None
-    if optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_ALL:
+    if optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_ALL and target_platform != 'amd64':
         optimizer_filter = ['NchwcTransformer']
 
     num_failures = 0
@@ -198,6 +198,12 @@ def parse_args():
                              'In particular, specify the value of the "ep.nnapi.partitioning_stop_ops" session '
                              'options config entry.')
 
+    parser.add_argument('--target_platform', type=str, default=None, choices=['arm', 'amd64'],
+                        help='Specify the target platform where the exported model will be used. '
+                             'This parameter can be used to choose between platform specific options, '
+                             'such as QDQIsInt8Allowed (arm), NCHWc (amd64) and NHWC (arm/amd64) format, different '
+                             'optimizer level options, etc.')
+
     parser.add_argument('model_path_or_dir', type=pathlib.Path,
                         help='Provide path to ONNX model or directory containing ONNX model/s to convert. '
                              'All files with a .onnx extension, including in subdirectories, will be processed.')
@@ -228,10 +234,16 @@ def convert_onnx_models_to_ort():
     if args.nnapi_partitioning_stop_ops is not None:
         session_options_config_entries["ep.nnapi.partitioning_stop_ops"] = args.nnapi_partitioning_stop_ops
 
+    if args.target_platform == 'arm':
+        session_options_config_entries["session.qdqisint8allowed"] = "1"
+    else:
+        session_options_config_entries["session.qdqisint8allowed"] = "0"
+
     for optimization_level in args.optimization_level:
         print(f"Converting models and creating configuration file for optimization level '{optimization_level}'")
         _convert(model_path_or_dir, optimization_level, args.use_nnapi, args.use_coreml, custom_op_library,
-                 args.save_optimized_onnx_model, args.allow_conversion_failures, session_options_config_entries)
+                 args.save_optimized_onnx_model, args.allow_conversion_failures, args.target_platform,
+                 session_options_config_entries)
 
         _create_config_file_from_ort_models(model_path_or_dir, optimization_level, args.enable_type_reduction)