mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-01 23:30:35 +00:00
[ROCm] add early stop to tunable profile progress (#15716)
For TunableOp, some instance may has very bad performance and it will take a long time during profile process. Add `tunable_op_max_tuning_duration_ms` parameter to limit max tuning time.
This commit is contained in:
parent
65b316a138
commit
1b518c6836
18 changed files with 96 additions and 6 deletions
|
|
@ -29,6 +29,7 @@ struct OrtCUDAProviderOptionsV2 {
|
|||
int cudnn_conv1d_pad_to_nc1d = 0; // flag specifying if pad Conv1D's input [N,C,D] to [N,C,1,D] or [N,C,D,1].
|
||||
int tunable_op_enable = 0; // flag specifying if TunableOp is enabled.
|
||||
int tunable_op_tuning_enable = 0; // flag specifying if TunableOp is enabled for tuning, this relies on TunableOp is enabled.
|
||||
int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp.
|
||||
int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel.
|
||||
// The strict mode has better accuracy but lower performance.
|
||||
};
|
||||
|
|
|
|||
|
|
@ -406,7 +406,8 @@ typedef struct OrtCUDAProviderOptions {
|
|||
user_compute_stream{},
|
||||
default_memory_arena_cfg{},
|
||||
tunable_op_enable{false},
|
||||
tunable_op_tuning_enable{false} {}
|
||||
tunable_op_tuning_enable{false},
|
||||
tunable_op_max_tuning_duration_ms{} {}
|
||||
#endif
|
||||
|
||||
/** \brief CUDA device Id
|
||||
|
|
@ -469,6 +470,11 @@ typedef struct OrtCUDAProviderOptions {
|
|||
*/
|
||||
int tunable_op_tuning_enable;
|
||||
|
||||
/** \brief Max tuning duration time limit for each instance of TunableOp.
|
||||
* Defaults to 0 to disable the limit.
|
||||
*/
|
||||
int tunable_op_max_tuning_duration_ms;
|
||||
|
||||
} OrtCUDAProviderOptions;
|
||||
|
||||
/** \brief ROCM Provider Options
|
||||
|
|
@ -487,7 +493,8 @@ typedef struct OrtROCMProviderOptions {
|
|||
user_compute_stream{},
|
||||
default_memory_arena_cfg{},
|
||||
tunable_op_enable{false},
|
||||
tunable_op_tuning_enable{false} {}
|
||||
tunable_op_tuning_enable{false},
|
||||
tunable_op_max_tuning_duration_ms{} {}
|
||||
#endif
|
||||
|
||||
/** \brief ROCM device Id
|
||||
|
|
@ -549,6 +556,11 @@ typedef struct OrtROCMProviderOptions {
|
|||
*/
|
||||
int tunable_op_tuning_enable;
|
||||
|
||||
/** \brief Max tuning duration time limit for each instance of TunableOp.
|
||||
* Defaults to 0 to disable the limit.
|
||||
*/
|
||||
int tunable_op_max_tuning_duration_ms;
|
||||
|
||||
} OrtROCMProviderOptions;
|
||||
|
||||
/** \brief TensorRT Provider Options
|
||||
|
|
|
|||
|
|
@ -209,14 +209,13 @@ class TunableOp {
|
|||
|
||||
private:
|
||||
static void WarmUp(Op<ParamsT>& op, const ParamsT* param) {
|
||||
constexpr const int num_iter = 4;
|
||||
constexpr const int num_iter = 1;
|
||||
for (int i = 0; i < num_iter; i++) {
|
||||
ORT_THROW_IF_ERROR(op(param));
|
||||
}
|
||||
}
|
||||
|
||||
static double Profile(Op<ParamsT>& op, const ParamsT* param) {
|
||||
constexpr const int num_iter = 100;
|
||||
static double Profile(Op<ParamsT>& op, const ParamsT* param, int num_iter) {
|
||||
TimerT timer{param->Stream()};
|
||||
timer.Start();
|
||||
for (int i = 0; i < num_iter; i++) {
|
||||
|
|
@ -242,12 +241,16 @@ class TunableOp {
|
|||
}
|
||||
|
||||
int FindFastestImpl(const ParamsT* params, const std::vector<Op<ParamsT>>& candidates) {
|
||||
ITuningContext* ctx = params->TuningContext();
|
||||
auto op_sig = Signature();
|
||||
auto param_sig = params->Signature();
|
||||
LOGS_DEFAULT(VERBOSE) << "FindFastestImpl for " << op_sig << '(' << param_sig << ')';
|
||||
auto min_time = std::numeric_limits<double>::infinity();
|
||||
int id = -1;
|
||||
|
||||
constexpr const int max_tuning_iter = 100;
|
||||
constexpr const int approx_num_iter = 3;
|
||||
|
||||
for (size_t i = 0; i < candidates.size(); i++) {
|
||||
auto& candidate = const_cast<Op<ParamsT>&>(candidates[i]);
|
||||
if (!IsSupported(candidate, params)) {
|
||||
|
|
@ -256,7 +259,13 @@ class TunableOp {
|
|||
}
|
||||
|
||||
WarmUp(candidate, params);
|
||||
auto time = Profile(candidate, params);
|
||||
|
||||
auto approx_duration = Profile(candidate, params, approx_num_iter);
|
||||
int tuning_iter = std::max(1, int(std::min(double(max_tuning_iter), ctx->GetMaxTuningDurationMs() / approx_duration)));
|
||||
|
||||
LOGS_DEFAULT(VERBOSE) << "FindFastestImpl run instance " << op_sig << '(' << param_sig << ") id=" << i << " " << tuning_iter << " times.";
|
||||
|
||||
auto time = Profile(candidate, params, tuning_iter);
|
||||
if (time < min_time) {
|
||||
min_time = time;
|
||||
id = static_cast<int>(i);
|
||||
|
|
|
|||
|
|
@ -28,6 +28,8 @@ class ITuningContext {
|
|||
virtual void DisableTuning() = 0;
|
||||
virtual bool IsTuningEnabled() const = 0;
|
||||
|
||||
virtual void SetMaxTuningDurationMs(int max_duration_ms) = 0;
|
||||
virtual int GetMaxTuningDurationMs() const = 0;
|
||||
virtual void EnableTunableOpAndTuning() final {
|
||||
EnableTunableOp();
|
||||
EnableTuning();
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ constexpr const char* kEnableCudaGraph = "enable_cuda_graph";
|
|||
constexpr const char* kCudnnConv1dPadToNc1d = "cudnn_conv1d_pad_to_nc1d";
|
||||
constexpr const char* kTunableOpEnable = "tunable_op_enable";
|
||||
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
|
||||
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
|
||||
constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode";
|
||||
} // namespace provider_option_names
|
||||
} // namespace cuda
|
||||
|
|
@ -108,6 +109,12 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
|
|||
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable));
|
||||
return Status::OK();
|
||||
})
|
||||
.AddValueParser(
|
||||
cuda::provider_option_names::kTunableOpMaxTuningDurationMs,
|
||||
[&info](const std::string& value_str) -> Status {
|
||||
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms));
|
||||
return Status::OK();
|
||||
})
|
||||
.Parse(options));
|
||||
|
||||
CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
|
||||
|
|
@ -132,6 +139,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
|
|||
{cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)},
|
||||
{cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)},
|
||||
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
|
||||
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
|
||||
{cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)},
|
||||
};
|
||||
|
||||
|
|
@ -149,6 +157,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid
|
|||
{cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)},
|
||||
{cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)},
|
||||
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)},
|
||||
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)},
|
||||
};
|
||||
|
||||
return options;
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ namespace cuda {
|
|||
struct TunableOpInfo {
|
||||
bool enable{false};
|
||||
bool tuning_enable{false};
|
||||
int max_tuning_duration_ms{};
|
||||
};
|
||||
} // namespace cuda
|
||||
|
||||
|
|
@ -83,6 +84,7 @@ struct std::hash<::onnxruntime::cuda::TunableOpInfo> {
|
|||
size_t seed_and_value{0xbc9f1d34};
|
||||
onnxruntime::HashCombine(info.enable, seed_and_value);
|
||||
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
|
||||
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
|
||||
return seed_and_value;
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -252,6 +252,7 @@ struct CUDA_Provider : Provider {
|
|||
info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0;
|
||||
info.tunable_op.enable = params->tunable_op_enable;
|
||||
info.tunable_op.tuning_enable = params->tunable_op_tuning_enable;
|
||||
info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms;
|
||||
info.enable_skip_layer_norm_strict_mode = params->enable_skip_layer_norm_strict_mode != 0;
|
||||
|
||||
return std::make_shared<CUDAProviderFactory>(info);
|
||||
|
|
|
|||
|
|
@ -77,6 +77,14 @@ bool CudaTuningContext::IsTuningEnabled() const {
|
|||
return info_->tuning_enable;
|
||||
}
|
||||
|
||||
void CudaTuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
|
||||
info_->max_tuning_duration_ms = max_duration_ms;
|
||||
}
|
||||
|
||||
int CudaTuningContext::GetMaxTuningDurationMs() const {
|
||||
return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
TuningResultsManager& CudaTuningContext::GetTuningResultsManager() {
|
||||
return manager_;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,9 @@ class CudaTuningContext : public ITuningContext {
|
|||
void DisableTuning() override;
|
||||
bool IsTuningEnabled() const override;
|
||||
|
||||
void SetMaxTuningDurationMs(int max_duration_ms) override;
|
||||
int GetMaxTuningDurationMs() const override;
|
||||
|
||||
TuningResultsManager& GetTuningResultsManager() override;
|
||||
const TuningResultsManager& GetTuningResultsManager() const override;
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
|
|||
constexpr const char* kMiopenConvUseMaxWorkspace = "miopen_conv_use_max_workspace";
|
||||
constexpr const char* kTunableOpEnable = "tunable_op_enable";
|
||||
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
|
||||
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
|
||||
} // namespace provider_option_names
|
||||
} // namespace rocm
|
||||
|
||||
|
|
@ -95,6 +96,12 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
|
|||
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable));
|
||||
return Status::OK();
|
||||
})
|
||||
.AddValueParser(
|
||||
rocm::provider_option_names::kTunableOpMaxTuningDurationMs,
|
||||
[&info](const std::string& value_str) -> Status {
|
||||
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms));
|
||||
return Status::OK();
|
||||
})
|
||||
.Parse(options));
|
||||
|
||||
ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
|
||||
|
|
@ -116,6 +123,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution
|
|||
{rocm::provider_option_names::kMiopenConvUseMaxWorkspace, MakeStringWithClassicLocale(info.miopen_conv_use_max_workspace)},
|
||||
{rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)},
|
||||
{rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
|
||||
{rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
|
||||
};
|
||||
|
||||
return options;
|
||||
|
|
@ -130,6 +138,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const OrtROCMProvid
|
|||
{rocm::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
|
||||
{rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)},
|
||||
{rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)},
|
||||
{rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)},
|
||||
};
|
||||
|
||||
return options;
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ namespace rocm {
|
|||
struct TunableOpInfo {
|
||||
bool enable{false};
|
||||
bool tuning_enable{false};
|
||||
int max_tuning_duration_ms{};
|
||||
};
|
||||
} // namespace rocm
|
||||
|
||||
|
|
@ -76,6 +77,7 @@ struct std::hash<::onnxruntime::rocm::TunableOpInfo> {
|
|||
size_t seed_and_value{0xbc9f1d34};
|
||||
onnxruntime::HashCombine(info.enable, seed_and_value);
|
||||
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
|
||||
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
|
||||
return seed_and_value;
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -175,6 +175,7 @@ struct ROCM_Provider : Provider {
|
|||
info.default_memory_arena_cfg = params->default_memory_arena_cfg;
|
||||
info.tunable_op.enable = params->tunable_op_enable;
|
||||
info.tunable_op.tuning_enable = params->tunable_op_tuning_enable;
|
||||
info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms;
|
||||
|
||||
return std::make_shared<ROCMProviderFactory>(info);
|
||||
}
|
||||
|
|
@ -193,6 +194,7 @@ struct ROCM_Provider : Provider {
|
|||
rocm_options.default_memory_arena_cfg = info.default_memory_arena_cfg;
|
||||
rocm_options.tunable_op_enable = info.tunable_op.enable;
|
||||
rocm_options.tunable_op_tuning_enable = info.tunable_op.tuning_enable;
|
||||
rocm_options.tunable_op_max_tuning_duration_ms = info.tunable_op.max_tuning_duration_ms;
|
||||
}
|
||||
|
||||
ProviderOptions GetProviderOptions(const void* provider_options) override {
|
||||
|
|
|
|||
|
|
@ -114,6 +114,14 @@ bool RocmTuningContext::IsTuningEnabled() const {
|
|||
return info_->tuning_enable;
|
||||
}
|
||||
|
||||
void RocmTuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
|
||||
info_->max_tuning_duration_ms = max_duration_ms;
|
||||
}
|
||||
|
||||
int RocmTuningContext::GetMaxTuningDurationMs() const {
|
||||
return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
TuningResultsManager& RocmTuningContext::GetTuningResultsManager() {
|
||||
return manager_;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -41,6 +41,9 @@ class RocmTuningContext : public ITuningContext {
|
|||
void DisableTuning() override;
|
||||
bool IsTuningEnabled() const override;
|
||||
|
||||
void SetMaxTuningDurationMs(int max_duration_ms) override;
|
||||
int GetMaxTuningDurationMs() const override;
|
||||
|
||||
TuningResultsManager& GetTuningResultsManager() override;
|
||||
const TuningResultsManager& GetTuningResultsManager() const override;
|
||||
|
||||
|
|
|
|||
|
|
@ -2086,6 +2086,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateROCMProviderOptions, _Outptr_ OrtROCMProvider
|
|||
options->default_memory_arena_cfg = nullptr;
|
||||
options->tunable_op_enable = 0;
|
||||
options->tunable_op_tuning_enable = 0;
|
||||
options->tunable_op_max_tuning_duration_ms = 0;
|
||||
|
||||
*out = options.release();
|
||||
return nullptr;
|
||||
|
|
|
|||
|
|
@ -57,6 +57,11 @@ class TestTuningContext : public ITuningContext {
|
|||
void DisableTuning() override { tuning_enabled_ = false; }
|
||||
bool IsTuningEnabled() const override { return tuning_enabled_; }
|
||||
|
||||
void SetMaxTuningDurationMs(int max_duration_ms) override { max_tuning_duration_ms_ = max_duration_ms; }
|
||||
int GetMaxTuningDurationMs() const override {
|
||||
return max_tuning_duration_ms_ > 0 ? max_tuning_duration_ms_ : std::numeric_limits<int>::max();
|
||||
}
|
||||
|
||||
TuningResultsManager& GetTuningResultsManager() override { return manager_; }
|
||||
const TuningResultsManager& GetTuningResultsManager() const override { return manager_; }
|
||||
|
||||
|
|
@ -67,6 +72,7 @@ class TestTuningContext : public ITuningContext {
|
|||
private:
|
||||
bool op_enabled_{false};
|
||||
bool tuning_enabled_{false};
|
||||
int max_tuning_duration_ms_{};
|
||||
TuningResultsManager manager_{};
|
||||
TestTuningResultsValidator validator_{};
|
||||
};
|
||||
|
|
@ -402,6 +408,13 @@ TEST(TunableOp, SelectFastIfTuning) {
|
|||
status = op(¶ms);
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
ASSERT_EQ(last_run, "FastFull");
|
||||
|
||||
// Also set max_tuning_duration_ms, fast should be selected
|
||||
params.TuningContext()->SetMaxTuningDurationMs(10);
|
||||
status = op(¶ms);
|
||||
ASSERT_TRUE(status.IsOK());
|
||||
ASSERT_EQ(last_run, "FastFull");
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -259,6 +259,8 @@ class TestInferenceSession(unittest.TestCase):
|
|||
|
||||
test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"])
|
||||
|
||||
test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"])
|
||||
|
||||
option["gpu_external_alloc"] = "0"
|
||||
option["gpu_external_free"] = "0"
|
||||
option["gpu_external_empty_cache"] = "0"
|
||||
|
|
@ -395,6 +397,8 @@ class TestInferenceSession(unittest.TestCase):
|
|||
|
||||
test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"])
|
||||
|
||||
test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"])
|
||||
|
||||
runRocmOptionsTest()
|
||||
|
||||
def testInvalidSetProviders(self): # noqa: N802
|
||||
|
|
|
|||
|
|
@ -191,6 +191,7 @@ std::unique_ptr<IExecutionProvider> DefaultRocmExecutionProvider(bool test_tunab
|
|||
provider_options.do_copy_in_default_stream = true;
|
||||
provider_options.tunable_op_enable = test_tunable_op ? 1 : 0;
|
||||
provider_options.tunable_op_tuning_enable = test_tunable_op ? 1 : 0;
|
||||
provider_options.tunable_op_max_tuning_duration_ms = 0;
|
||||
if (auto factory = RocmProviderFactoryCreator::Create(&provider_options))
|
||||
return factory->CreateProvider();
|
||||
#endif
|
||||
|
|
|
|||
Loading…
Reference in a new issue