diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h index 084e66fd81..eaf0e5337b 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h +++ b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h @@ -29,6 +29,7 @@ struct OrtCUDAProviderOptionsV2 { int cudnn_conv1d_pad_to_nc1d = 0; // flag specifying if pad Conv1D's input [N,C,D] to [N,C,1,D] or [N,C,D,1]. int tunable_op_enable = 0; // flag specifying if TunableOp is enabled. int tunable_op_tuning_enable = 0; // flag specifying if TunableOp is enabled for tuning, this relies on TunableOp is enabled. + int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp. int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel. // The strict mode has better accuracy but lower performance. }; diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 0be5f5e8b4..3c6df85c04 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -406,7 +406,8 @@ typedef struct OrtCUDAProviderOptions { user_compute_stream{}, default_memory_arena_cfg{}, tunable_op_enable{false}, - tunable_op_tuning_enable{false} {} + tunable_op_tuning_enable{false}, + tunable_op_max_tuning_duration_ms{} {} #endif /** \brief CUDA device Id @@ -469,6 +470,11 @@ typedef struct OrtCUDAProviderOptions { */ int tunable_op_tuning_enable; + /** \brief Max tuning duration time limit for each instance of TunableOp. + * Defaults to 0 to disable the limit. + */ + int tunable_op_max_tuning_duration_ms; + } OrtCUDAProviderOptions; /** \brief ROCM Provider Options @@ -487,7 +493,8 @@ typedef struct OrtROCMProviderOptions { user_compute_stream{}, default_memory_arena_cfg{}, tunable_op_enable{false}, - tunable_op_tuning_enable{false} {} + tunable_op_tuning_enable{false}, + tunable_op_max_tuning_duration_ms{} {} #endif /** \brief ROCM device Id @@ -549,6 +556,11 @@ typedef struct OrtROCMProviderOptions { */ int tunable_op_tuning_enable; + /** \brief Max tuning duration time limit for each instance of TunableOp. + * Defaults to 0 to disable the limit. + */ + int tunable_op_max_tuning_duration_ms; + } OrtROCMProviderOptions; /** \brief TensorRT Provider Options diff --git a/onnxruntime/core/framework/tunable.h b/onnxruntime/core/framework/tunable.h index 7c4467d348..3c3e957f75 100644 --- a/onnxruntime/core/framework/tunable.h +++ b/onnxruntime/core/framework/tunable.h @@ -209,14 +209,13 @@ class TunableOp { private: static void WarmUp(Op& op, const ParamsT* param) { - constexpr const int num_iter = 4; + constexpr const int num_iter = 1; for (int i = 0; i < num_iter; i++) { ORT_THROW_IF_ERROR(op(param)); } } - static double Profile(Op& op, const ParamsT* param) { - constexpr const int num_iter = 100; + static double Profile(Op& op, const ParamsT* param, int num_iter) { TimerT timer{param->Stream()}; timer.Start(); for (int i = 0; i < num_iter; i++) { @@ -242,12 +241,16 @@ class TunableOp { } int FindFastestImpl(const ParamsT* params, const std::vector>& candidates) { + ITuningContext* ctx = params->TuningContext(); auto op_sig = Signature(); auto param_sig = params->Signature(); LOGS_DEFAULT(VERBOSE) << "FindFastestImpl for " << op_sig << '(' << param_sig << ')'; auto min_time = std::numeric_limits::infinity(); int id = -1; + constexpr const int max_tuning_iter = 100; + constexpr const int approx_num_iter = 3; + for (size_t i = 0; i < candidates.size(); i++) { auto& candidate = const_cast&>(candidates[i]); if (!IsSupported(candidate, params)) { @@ -256,7 +259,13 @@ class TunableOp { } WarmUp(candidate, params); - auto time = Profile(candidate, params); + + auto approx_duration = Profile(candidate, params, approx_num_iter); + int tuning_iter = std::max(1, int(std::min(double(max_tuning_iter), ctx->GetMaxTuningDurationMs() / approx_duration))); + + LOGS_DEFAULT(VERBOSE) << "FindFastestImpl run instance " << op_sig << '(' << param_sig << ") id=" << i << " " << tuning_iter << " times."; + + auto time = Profile(candidate, params, tuning_iter); if (time < min_time) { min_time = time; id = static_cast(i); diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h index 3fea4cb85f..5c5c595601 100644 --- a/onnxruntime/core/framework/tuning_context.h +++ b/onnxruntime/core/framework/tuning_context.h @@ -28,6 +28,8 @@ class ITuningContext { virtual void DisableTuning() = 0; virtual bool IsTuningEnabled() const = 0; + virtual void SetMaxTuningDurationMs(int max_duration_ms) = 0; + virtual int GetMaxTuningDurationMs() const = 0; virtual void EnableTunableOpAndTuning() final { EnableTunableOp(); EnableTuning(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc index bb840e9de2..e713dfc132 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc @@ -26,6 +26,7 @@ constexpr const char* kEnableCudaGraph = "enable_cuda_graph"; constexpr const char* kCudnnConv1dPadToNc1d = "cudnn_conv1d_pad_to_nc1d"; constexpr const char* kTunableOpEnable = "tunable_op_enable"; constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable"; +constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms"; constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode"; } // namespace provider_option_names } // namespace cuda @@ -108,6 +109,12 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable)); return Status::OK(); }) + .AddValueParser( + cuda::provider_option_names::kTunableOpMaxTuningDurationMs, + [&info](const std::string& value_str) -> Status { + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms)); + return Status::OK(); + }) .Parse(options)); CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache}; @@ -132,6 +139,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution {cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)}, {cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)}, {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)}, + {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)}, {cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)}, }; @@ -149,6 +157,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid {cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)}, {cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)}, {cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)}, + {cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)}, }; return options; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h index c268518974..789b02b0e1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h @@ -40,6 +40,7 @@ namespace cuda { struct TunableOpInfo { bool enable{false}; bool tuning_enable{false}; + int max_tuning_duration_ms{}; }; } // namespace cuda @@ -83,6 +84,7 @@ struct std::hash<::onnxruntime::cuda::TunableOpInfo> { size_t seed_and_value{0xbc9f1d34}; onnxruntime::HashCombine(info.enable, seed_and_value); onnxruntime::HashCombine(info.tuning_enable, seed_and_value); + onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value); return seed_and_value; } }; diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 6e8c4d0ff0..d9fd8606e7 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -252,6 +252,7 @@ struct CUDA_Provider : Provider { info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; + info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms; info.enable_skip_layer_norm_strict_mode = params->enable_skip_layer_norm_strict_mode != 0; return std::make_shared(info); diff --git a/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.cc b/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.cc index 4e302a2c66..b427b53fb6 100644 --- a/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.cc +++ b/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.cc @@ -77,6 +77,14 @@ bool CudaTuningContext::IsTuningEnabled() const { return info_->tuning_enable; } +void CudaTuningContext::SetMaxTuningDurationMs(int max_duration_ms) { + info_->max_tuning_duration_ms = max_duration_ms; +} + +int CudaTuningContext::GetMaxTuningDurationMs() const { + return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits::max(); +} + TuningResultsManager& CudaTuningContext::GetTuningResultsManager() { return manager_; } diff --git a/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.h b/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.h index ec961890e6..2184a5d60b 100644 --- a/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.h +++ b/onnxruntime/core/providers/cuda/tunable/cuda_tuning_context.h @@ -39,6 +39,9 @@ class CudaTuningContext : public ITuningContext { void DisableTuning() override; bool IsTuningEnabled() const override; + void SetMaxTuningDurationMs(int max_duration_ms) override; + int GetMaxTuningDurationMs() const override; + TuningResultsManager& GetTuningResultsManager() override; const TuningResultsManager& GetTuningResultsManager() const override; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc index 2901c1a83d..91e3aaaa42 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc @@ -23,6 +23,7 @@ constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache"; constexpr const char* kMiopenConvUseMaxWorkspace = "miopen_conv_use_max_workspace"; constexpr const char* kTunableOpEnable = "tunable_op_enable"; constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable"; +constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms"; } // namespace provider_option_names } // namespace rocm @@ -95,6 +96,12 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable)); return Status::OK(); }) + .AddValueParser( + rocm::provider_option_names::kTunableOpMaxTuningDurationMs, + [&info](const std::string& value_str) -> Status { + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms)); + return Status::OK(); + }) .Parse(options)); ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache}; @@ -116,6 +123,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution {rocm::provider_option_names::kMiopenConvUseMaxWorkspace, MakeStringWithClassicLocale(info.miopen_conv_use_max_workspace)}, {rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)}, {rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)}, + {rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)}, }; return options; @@ -130,6 +138,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const OrtROCMProvid {rocm::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)}, {rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)}, {rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)}, + {rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)}, }; return options; diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h index 5b1fd09589..e35c0cc0af 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.h @@ -40,6 +40,7 @@ namespace rocm { struct TunableOpInfo { bool enable{false}; bool tuning_enable{false}; + int max_tuning_duration_ms{}; }; } // namespace rocm @@ -76,6 +77,7 @@ struct std::hash<::onnxruntime::rocm::TunableOpInfo> { size_t seed_and_value{0xbc9f1d34}; onnxruntime::HashCombine(info.enable, seed_and_value); onnxruntime::HashCombine(info.tuning_enable, seed_and_value); + onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value); return seed_and_value; } }; diff --git a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc index e1a07a3486..f452c1ae82 100644 --- a/onnxruntime/core/providers/rocm/rocm_provider_factory.cc +++ b/onnxruntime/core/providers/rocm/rocm_provider_factory.cc @@ -175,6 +175,7 @@ struct ROCM_Provider : Provider { info.default_memory_arena_cfg = params->default_memory_arena_cfg; info.tunable_op.enable = params->tunable_op_enable; info.tunable_op.tuning_enable = params->tunable_op_tuning_enable; + info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms; return std::make_shared(info); } @@ -193,6 +194,7 @@ struct ROCM_Provider : Provider { rocm_options.default_memory_arena_cfg = info.default_memory_arena_cfg; rocm_options.tunable_op_enable = info.tunable_op.enable; rocm_options.tunable_op_tuning_enable = info.tunable_op.tuning_enable; + rocm_options.tunable_op_max_tuning_duration_ms = info.tunable_op.max_tuning_duration_ms; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.cc b/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.cc index 1cc45ce840..4a1b37387b 100644 --- a/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.cc +++ b/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.cc @@ -114,6 +114,14 @@ bool RocmTuningContext::IsTuningEnabled() const { return info_->tuning_enable; } +void RocmTuningContext::SetMaxTuningDurationMs(int max_duration_ms) { + info_->max_tuning_duration_ms = max_duration_ms; +} + +int RocmTuningContext::GetMaxTuningDurationMs() const { + return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits::max(); +} + TuningResultsManager& RocmTuningContext::GetTuningResultsManager() { return manager_; } diff --git a/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.h b/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.h index d2ddb37dd0..f0feb1cb74 100644 --- a/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.h +++ b/onnxruntime/core/providers/rocm/tunable/rocm_tuning_context.h @@ -41,6 +41,9 @@ class RocmTuningContext : public ITuningContext { void DisableTuning() override; bool IsTuningEnabled() const override; + void SetMaxTuningDurationMs(int max_duration_ms) override; + int GetMaxTuningDurationMs() const override; + TuningResultsManager& GetTuningResultsManager() override; const TuningResultsManager& GetTuningResultsManager() const override; diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 68891489d1..f3a424f085 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2086,6 +2086,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateROCMProviderOptions, _Outptr_ OrtROCMProvider options->default_memory_arena_cfg = nullptr; options->tunable_op_enable = 0; options->tunable_op_tuning_enable = 0; + options->tunable_op_max_tuning_duration_ms = 0; *out = options.release(); return nullptr; diff --git a/onnxruntime/test/framework/tunable_op_test.cc b/onnxruntime/test/framework/tunable_op_test.cc index 6793b1c49c..4565a99e87 100644 --- a/onnxruntime/test/framework/tunable_op_test.cc +++ b/onnxruntime/test/framework/tunable_op_test.cc @@ -57,6 +57,11 @@ class TestTuningContext : public ITuningContext { void DisableTuning() override { tuning_enabled_ = false; } bool IsTuningEnabled() const override { return tuning_enabled_; } + void SetMaxTuningDurationMs(int max_duration_ms) override { max_tuning_duration_ms_ = max_duration_ms; } + int GetMaxTuningDurationMs() const override { + return max_tuning_duration_ms_ > 0 ? max_tuning_duration_ms_ : std::numeric_limits::max(); + } + TuningResultsManager& GetTuningResultsManager() override { return manager_; } const TuningResultsManager& GetTuningResultsManager() const override { return manager_; } @@ -67,6 +72,7 @@ class TestTuningContext : public ITuningContext { private: bool op_enabled_{false}; bool tuning_enabled_{false}; + int max_tuning_duration_ms_{}; TuningResultsManager manager_{}; TestTuningResultsValidator validator_{}; }; @@ -402,6 +408,13 @@ TEST(TunableOp, SelectFastIfTuning) { status = op(¶ms); ASSERT_TRUE(status.IsOK()); ASSERT_EQ(last_run, "FastFull"); + + // Also set max_tuning_duration_ms, fast should be selected + params.TuningContext()->SetMaxTuningDurationMs(10); + status = op(¶ms); + ASSERT_TRUE(status.IsOK()); + ASSERT_EQ(last_run, "FastFull"); + #endif } diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index b49547a040..3bea82b12c 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -259,6 +259,8 @@ class TestInferenceSession(unittest.TestCase): test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"]) + test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"]) + option["gpu_external_alloc"] = "0" option["gpu_external_free"] = "0" option["gpu_external_empty_cache"] = "0" @@ -395,6 +397,8 @@ class TestInferenceSession(unittest.TestCase): test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"]) + test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"]) + runRocmOptionsTest() def testInvalidSetProviders(self): # noqa: N802 diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index d5847ddd4e..da8710eea8 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -191,6 +191,7 @@ std::unique_ptr DefaultRocmExecutionProvider(bool test_tunab provider_options.do_copy_in_default_stream = true; provider_options.tunable_op_enable = test_tunable_op ? 1 : 0; provider_options.tunable_op_tuning_enable = test_tunable_op ? 1 : 0; + provider_options.tunable_op_max_tuning_duration_ms = 0; if (auto factory = RocmProviderFactoryCreator::Create(&provider_options)) return factory->CreateProvider(); #endif