mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-01 03:45:06 +00:00
Add option to force QDQIsInt8Allowed to return true when exporting to ORT format (#10719)
* wip
* save
* minor update
* fix
* fix
* Revert "fix"
This reverts commit a76f364b2d.
* revert
* revert
* revert submodule removal
* address pr comments
* minor fix
* address cr comments
* fix format
Co-authored-by: rachguo <rachguo@rachguos-Mini.attlocal.net>
This commit is contained in:
parent
44d08d80a0
commit
a9dc50ba8b
5 changed files with 31 additions and 9 deletions
|
|
@ -78,6 +78,10 @@ static const char* const kOrtSessionOptionsConfigAllowIntraOpSpinning = "session
|
|||
// has to guarantee that the model bytes are valid until the ORT session using the model bytes is destroyed.
|
||||
static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "session.use_ort_model_bytes_directly";
|
||||
|
||||
// This should only be specified when exporting an ORT format model for use on a different platform.
|
||||
// If the ORT format model will be used on ARM platforms, set to "1". For other platforms, set to "0".
|
||||
static const char* const kOrtSessionOptionsQDQIsInt8Allowed = "session.qdqisint8allowed";
|
||||
|
||||
// Save information for replaying graph optimizations later instead of applying them directly.
|
||||
//
|
||||
// When an ONNX model is loaded, ORT can perform various optimizations on the graph.
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "core/optimizer/conv_activation_fusion.h"
|
||||
#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
|
||||
#include "core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h"
|
||||
#include "core/session/onnxruntime_session_options_config_keys.h"
|
||||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
|
|
@ -161,6 +162,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
|
|||
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1";
|
||||
const bool enable_quant_qdq_cleanup =
|
||||
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQCleanup, "0") == "1";
|
||||
const bool qdq_is_int8_allowed =
|
||||
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsQDQIsInt8Allowed, QDQIsInt8Allowed() ? "1" : "0") == "1";
|
||||
#ifndef DISABLE_CONTRIB_OPS
|
||||
const bool enable_gelu_approximation =
|
||||
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableGeluApproximation, "0") == "1";
|
||||
|
|
@ -208,10 +211,13 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
|
|||
onnxruntime::kArmNNExecutionProvider};
|
||||
|
||||
if (!disable_quant_qdq) {
|
||||
if (!QDQIsInt8Allowed()) {
|
||||
// Currently we don't support QDQS8ToU8Transformer in a minimal build. If it were supported, it would need to run in
|
||||
// Level 1 during export and not Level 2 at runtime, as running it at runtime would result in overlapping optimizations, which
|
||||
// runtime optimization does not support. So add a session config value here to force qdqisint8allowed to be true.
|
||||
if (!qdq_is_int8_allowed) {
|
||||
transformers.emplace_back(std::make_unique<QDQS8ToU8Transformer>(cpu_ep));
|
||||
}
|
||||
transformers.emplace_back(std::make_unique<QDQSelectorActionTransformer>());
|
||||
transformers.emplace_back(std::make_unique<QDQSelectorActionTransformer>(SatApplyContextVariant{}, qdq_is_int8_allowed));
|
||||
}
|
||||
|
||||
transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
|
||||
|
|
|
|||
|
|
@ -208,10 +208,10 @@ SelectorActionRegistry CreateSelectorActionRegistry(bool is_int8_allowed) {
|
|||
} // namespace
|
||||
|
||||
QDQSelectorActionTransformer::QDQSelectorActionTransformer(
|
||||
const SatApplyContextVariant& apply_context)
|
||||
const SatApplyContextVariant& apply_context, bool is_int8_allowed)
|
||||
: SelectorActionTransformer{
|
||||
"QDQSelectorActionTransformer",
|
||||
CreateSelectorActionRegistry(QDQIsInt8Allowed()),
|
||||
CreateSelectorActionRegistry(is_int8_allowed),
|
||||
apply_context,
|
||||
// this transformer is only compatible with the CPU EP
|
||||
{kCpuExecutionProvider}} {
|
||||
|
|
|
|||
|
|
@ -18,11 +18,11 @@ inline constexpr bool QDQIsInt8Allowed(){
|
|||
|
||||
|
||||
/**
|
||||
Transformer that fuses QDQ and fp32 ops into quantized ops.
|
||||
Transformer that fuses QDQ and fp32 ops into quantized ops.
|
||||
*/
|
||||
class QDQSelectorActionTransformer : public SelectorActionTransformer {
|
||||
public:
|
||||
QDQSelectorActionTransformer(const SatApplyContextVariant& apply_context = {});
|
||||
QDQSelectorActionTransformer(const SatApplyContextVariant& apply_context = {}, bool is_int8_allowed = false);
|
||||
};
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ def _create_session_options(optimization_level: ort.GraphOptimizationLevel,
|
|||
|
||||
def _convert(model_path_or_dir: pathlib.Path, optimization_level_str: str, use_nnapi: bool, use_coreml: bool,
|
||||
custom_op_library: pathlib.Path, create_optimized_onnx_model: bool, allow_conversion_failures: bool,
|
||||
session_options_config_entries: typing.Dict[str, str]):
|
||||
target_platform: str, session_options_config_entries: typing.Dict[str, str]):
|
||||
|
||||
optimization_level = get_optimization_level(optimization_level_str)
|
||||
|
||||
|
|
@ -91,7 +91,7 @@ def _convert(model_path_or_dir: pathlib.Path, optimization_level_str: str, use_n
|
|||
# If someone really really really wants to run it they could manually create an optimized onnx model first,
|
||||
# or they could comment out this code.
|
||||
optimizer_filter = None
|
||||
if optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_ALL:
|
||||
if optimization_level == ort.GraphOptimizationLevel.ORT_ENABLE_ALL and target_platform != 'amd64':
|
||||
optimizer_filter = ['NchwcTransformer']
|
||||
|
||||
num_failures = 0
|
||||
|
|
@ -198,6 +198,12 @@ def parse_args():
|
|||
'In particular, specify the value of the "ep.nnapi.partitioning_stop_ops" session '
|
||||
'options config entry.')
|
||||
|
||||
parser.add_argument('--target_platform', type=str, default=None, choices=['arm', 'amd64'],
|
||||
help='Specify the target platform where the exported model will be used. '
|
||||
'This parameter can be used to choose between platform specific options, '
|
||||
'such as QDQIsInt8Allowed(arm), NCHWc (amd64) and NHWC (arm/amd64) format different '
|
||||
'optimizer level options,etc.')
|
||||
|
||||
parser.add_argument('model_path_or_dir', type=pathlib.Path,
|
||||
help='Provide path to ONNX model or directory containing ONNX model/s to convert. '
|
||||
'All files with a .onnx extension, including in subdirectories, will be processed.')
|
||||
|
|
@ -228,10 +234,16 @@ def convert_onnx_models_to_ort():
|
|||
if args.nnapi_partitioning_stop_ops is not None:
|
||||
session_options_config_entries["ep.nnapi.partitioning_stop_ops"] = args.nnapi_partitioning_stop_ops
|
||||
|
||||
if args.target_platform == 'arm':
|
||||
session_options_config_entries["session.qdqisint8allowed"] = "1"
|
||||
else:
|
||||
session_options_config_entries["session.qdqisint8allowed"] = "0"
|
||||
|
||||
for optimization_level in args.optimization_level:
|
||||
print(f"Converting models and creating configuration file for optimization level '{optimization_level}'")
|
||||
_convert(model_path_or_dir, optimization_level, args.use_nnapi, args.use_coreml, custom_op_library,
|
||||
args.save_optimized_onnx_model, args.allow_conversion_failures, session_options_config_entries)
|
||||
args.save_optimized_onnx_model, args.allow_conversion_failures, args.target_platform,
|
||||
session_options_config_entries)
|
||||
|
||||
_create_config_file_from_ort_models(model_path_or_dir, optimization_level, args.enable_type_reduction)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue