diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 3fd9434726..84a9819d55 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -146,14 +146,23 @@ std::vector> GenerateTransformers( const IExecutionProvider& cpu_execution_provider, /*required by constant folding*/ const std::unordered_set& rules_and_transformers_to_disable) { std::vector> transformers; - std::unique_ptr rule_transformer = nullptr; - bool disable_quant_qdq = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; + bool disable_quant_qdq = + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; #ifndef DISABLE_CONTRIB_OPS - bool enable_gelu_approximation = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableGeluApproximation, "0") == "1"; + bool enable_gelu_approximation = + session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableGeluApproximation, "0") == "1"; #endif switch (level) { case TransformerLevel::Level1: { + // RewriteRule optimizations are the simplest (they generally remove unnecessary nodes and are cheap to run) + // so run them first so there is potentially less for the more intensive optimizations like ConstantFolding, + // CommonSubexpressionElimination and TransposeOptimizer to do. + auto rule_transformer = GenerateRuleBasedGraphTransformer(level, rules_and_transformers_to_disable, {}); + if (rule_transformer != nullptr) { + transformers.emplace_back(std::move(rule_transformer)); + } + // no filtering on execution provider for L1 optimizations as they only use official ONNX operators transformers.emplace_back(std::make_unique()); transformers.emplace_back(std::make_unique(cpu_execution_provider, !disable_quant_qdq)); @@ -163,16 +172,11 @@ std::vector> GenerateTransformers( session_options.free_dimension_overrides)); auto cpu_allocator = cpu_execution_provider.GetAllocator(0, OrtMemTypeDefault); transformers.emplace_back(std::make_unique(std::move(cpu_allocator))); - - rule_transformer = GenerateRuleBasedGraphTransformer(level, rules_and_transformers_to_disable, {}); } break; case TransformerLevel::Level2: { std::unordered_set cpu_ep = {onnxruntime::kCpuExecutionProvider}; - // create rule based transformer consisting of all the level2 rewrite rules - rule_transformer = GenerateRuleBasedGraphTransformer(level, rules_and_transformers_to_disable, cpu_ep); - #ifndef DISABLE_CONTRIB_OPS const std::unordered_set cuda_rocm_eps = {onnxruntime::kCudaExecutionProvider, onnxruntime::kRocmExecutionProvider}; @@ -238,10 +242,6 @@ std::vector> GenerateTransformers( ORT_THROW("Unsupported optimization level: ", static_cast(level)); } - if (rule_transformer != nullptr) { - transformers.emplace_back(std::move(rule_transformer)); - } - FilterTransformers(transformers, rules_and_transformers_to_disable); return transformers; diff --git a/onnxruntime/core/optimizer/transpose_optimizer/ort_transpose_optimizer.h b/onnxruntime/core/optimizer/transpose_optimizer/ort_transpose_optimizer.h index 46816caeee..9535e23a18 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/ort_transpose_optimizer.h +++ b/onnxruntime/core/optimizer/transpose_optimizer/ort_transpose_optimizer.h @@ -21,10 +21,6 @@ class TransposeOptimizer : public GraphTransformer { : GraphTransformer("TransposeOptimizer"), cpu_allocator_(std::move(cpu_allocator)) {} Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; - - // One run should be sufficient. Multiple runs should be ok but are prohibited to prevent any possibility of an - // infinite loop. - bool ShouldOnlyApplyOnce() const override { return true; } }; } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index bc55562214..c4afac1b02 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -8,11 +8,12 @@ #include "graph_transform_test_builder.h" #include "core/graph/graph.h" +#include "test/test_environment.h" +#include "test/util/include/asserts.h" namespace onnxruntime { namespace test { - void SetNodeArgShape(NodeArg* node_arg, const std::optional>& shape) { if (shape == std::nullopt) { node_arg->ClearShape(); @@ -77,7 +78,6 @@ int EstimateTransposeCost(const Graph& graph) { return cost; } - TEST(TransposeOptimizerTests, TestSplit) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = builder.MakeInput({4, 6, 10}, 0.0, 1.0); @@ -3849,8 +3849,22 @@ TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { /*opset_version*/ 15); } +// regression test for a model where the transpose optimizations were not completed in a single pass in level 1. +// fixed by +// a) moving the RewriteRule level 1 optimizations so they run prior to the transpose optimizer; and +// b) not returning `true` from TransposeOptimizer::ShouldOnlyApplyOnce as it should be safe to run the +// transpose optimizer multiple times to ensure it completes in level 1. +// either of those changes would have fixed the issue. +// see https://github.com/microsoft/onnxruntime/issues/9671 for more details. +TEST(TransposeOptimizerTests, RegressionTest_GitHubIssue9671) { + auto model_uri = ORT_TSTR("testdata/gh_issue_9671.onnx"); - + SessionOptions so; + so.session_logid = "TransposeOptimizerTests.RegressionTest_GitHubIssue9671"; + InferenceSession session_object{so, GetEnvironment()}; + ASSERT_STATUS_OK(session_object.Load(model_uri)); + ASSERT_STATUS_OK(session_object.Initialize()); // optimizers run during initialization +} } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/gh_issue_9671.onnx b/onnxruntime/test/testdata/gh_issue_9671.onnx new file mode 100644 index 0000000000..53f1215402 Binary files /dev/null and b/onnxruntime/test/testdata/gh_issue_9671.onnx differ