[ROCm] add early stop to tunable profile progress (#15716)

For TunableOp, some instance may has very bad performance and it will
take a long time during profile process.
Add `tunable_op_max_tuning_duration_ms` parameter to limit max tuning
time.
This commit is contained in:
PeixuanZuo 2023-06-01 10:18:25 +08:00 committed by GitHub
parent 65b316a138
commit 1b518c6836
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 96 additions and 6 deletions

View file

@ -29,6 +29,7 @@ struct OrtCUDAProviderOptionsV2 {
int cudnn_conv1d_pad_to_nc1d = 0; // flag specifying if pad Conv1D's input [N,C,D] to [N,C,1,D] or [N,C,D,1].
int tunable_op_enable = 0; // flag specifying if TunableOp is enabled.
int tunable_op_tuning_enable = 0; // flag specifying if TunableOp is enabled for tuning, this relies on TunableOp is enabled.
int tunable_op_max_tuning_duration_ms = 0; // Max tuning duration time limit for TunableOp.
int enable_skip_layer_norm_strict_mode = 0; // flag specifying if SkipLayerNorm is in strict mode. If true, use LayerNormalization kernel.
// The strict mode has better accuracy but lower performance.
};

View file

@ -406,7 +406,8 @@ typedef struct OrtCUDAProviderOptions {
user_compute_stream{},
default_memory_arena_cfg{},
tunable_op_enable{false},
tunable_op_tuning_enable{false} {}
tunable_op_tuning_enable{false},
tunable_op_max_tuning_duration_ms{} {}
#endif
/** \brief CUDA device Id
@ -469,6 +470,11 @@ typedef struct OrtCUDAProviderOptions {
*/
int tunable_op_tuning_enable;
/** \brief Max tuning duration time limit for each instance of TunableOp.
* Defaults to 0 to disable the limit.
*/
int tunable_op_max_tuning_duration_ms;
} OrtCUDAProviderOptions;
/** \brief ROCM Provider Options
@ -487,7 +493,8 @@ typedef struct OrtROCMProviderOptions {
user_compute_stream{},
default_memory_arena_cfg{},
tunable_op_enable{false},
tunable_op_tuning_enable{false} {}
tunable_op_tuning_enable{false},
tunable_op_max_tuning_duration_ms{} {}
#endif
/** \brief ROCM device Id
@ -549,6 +556,11 @@ typedef struct OrtROCMProviderOptions {
*/
int tunable_op_tuning_enable;
/** \brief Max tuning duration time limit for each instance of TunableOp.
* Defaults to 0 to disable the limit.
*/
int tunable_op_max_tuning_duration_ms;
} OrtROCMProviderOptions;
/** \brief TensorRT Provider Options

View file

@ -209,14 +209,13 @@ class TunableOp {
private:
static void WarmUp(Op<ParamsT>& op, const ParamsT* param) {
constexpr const int num_iter = 4;
constexpr const int num_iter = 1;
for (int i = 0; i < num_iter; i++) {
ORT_THROW_IF_ERROR(op(param));
}
}
static double Profile(Op<ParamsT>& op, const ParamsT* param) {
constexpr const int num_iter = 100;
static double Profile(Op<ParamsT>& op, const ParamsT* param, int num_iter) {
TimerT timer{param->Stream()};
timer.Start();
for (int i = 0; i < num_iter; i++) {
@ -242,12 +241,16 @@ class TunableOp {
}
int FindFastestImpl(const ParamsT* params, const std::vector<Op<ParamsT>>& candidates) {
ITuningContext* ctx = params->TuningContext();
auto op_sig = Signature();
auto param_sig = params->Signature();
LOGS_DEFAULT(VERBOSE) << "FindFastestImpl for " << op_sig << '(' << param_sig << ')';
auto min_time = std::numeric_limits<double>::infinity();
int id = -1;
constexpr const int max_tuning_iter = 100;
constexpr const int approx_num_iter = 3;
for (size_t i = 0; i < candidates.size(); i++) {
auto& candidate = const_cast<Op<ParamsT>&>(candidates[i]);
if (!IsSupported(candidate, params)) {
@ -256,7 +259,13 @@ class TunableOp {
}
WarmUp(candidate, params);
auto time = Profile(candidate, params);
auto approx_duration = Profile(candidate, params, approx_num_iter);
int tuning_iter = std::max(1, int(std::min(double(max_tuning_iter), ctx->GetMaxTuningDurationMs() / approx_duration)));
LOGS_DEFAULT(VERBOSE) << "FindFastestImpl run instance " << op_sig << '(' << param_sig << ") id=" << i << " " << tuning_iter << " times.";
auto time = Profile(candidate, params, tuning_iter);
if (time < min_time) {
min_time = time;
id = static_cast<int>(i);

View file

@ -28,6 +28,8 @@ class ITuningContext {
virtual void DisableTuning() = 0;
virtual bool IsTuningEnabled() const = 0;
virtual void SetMaxTuningDurationMs(int max_duration_ms) = 0;
virtual int GetMaxTuningDurationMs() const = 0;
virtual void EnableTunableOpAndTuning() final {
EnableTunableOp();
EnableTuning();

View file

@ -26,6 +26,7 @@ constexpr const char* kEnableCudaGraph = "enable_cuda_graph";
constexpr const char* kCudnnConv1dPadToNc1d = "cudnn_conv1d_pad_to_nc1d";
constexpr const char* kTunableOpEnable = "tunable_op_enable";
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
constexpr const char* kEnableSkipLayerNormStrictMode = "enable_skip_layer_norm_strict_mode";
} // namespace provider_option_names
} // namespace cuda
@ -108,6 +109,12 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable));
return Status::OK();
})
.AddValueParser(
cuda::provider_option_names::kTunableOpMaxTuningDurationMs,
[&info](const std::string& value_str) -> Status {
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms));
return Status::OK();
})
.Parse(options));
CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
@ -132,6 +139,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
{cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)},
{cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)},
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
{cuda::provider_option_names::kEnableSkipLayerNormStrictMode, MakeStringWithClassicLocale(info.enable_skip_layer_norm_strict_mode)},
};
@ -149,6 +157,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const OrtCUDAProvid
{cuda::provider_option_names::kCudnnConv1dPadToNc1d, MakeStringWithClassicLocale(info.cudnn_conv1d_pad_to_nc1d)},
{cuda::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)},
{cuda::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)},
{cuda::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)},
};
return options;

View file

@ -40,6 +40,7 @@ namespace cuda {
struct TunableOpInfo {
bool enable{false};
bool tuning_enable{false};
int max_tuning_duration_ms{};
};
} // namespace cuda
@ -83,6 +84,7 @@ struct std::hash<::onnxruntime::cuda::TunableOpInfo> {
size_t seed_and_value{0xbc9f1d34};
onnxruntime::HashCombine(info.enable, seed_and_value);
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
return seed_and_value;
}
};

View file

@ -252,6 +252,7 @@ struct CUDA_Provider : Provider {
info.cudnn_conv1d_pad_to_nc1d = params->cudnn_conv1d_pad_to_nc1d != 0;
info.tunable_op.enable = params->tunable_op_enable;
info.tunable_op.tuning_enable = params->tunable_op_tuning_enable;
info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms;
info.enable_skip_layer_norm_strict_mode = params->enable_skip_layer_norm_strict_mode != 0;
return std::make_shared<CUDAProviderFactory>(info);

View file

@ -77,6 +77,14 @@ bool CudaTuningContext::IsTuningEnabled() const {
return info_->tuning_enable;
}
void CudaTuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
info_->max_tuning_duration_ms = max_duration_ms;
}
int CudaTuningContext::GetMaxTuningDurationMs() const {
return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits<int>::max();
}
TuningResultsManager& CudaTuningContext::GetTuningResultsManager() {
return manager_;
}

View file

@ -39,6 +39,9 @@ class CudaTuningContext : public ITuningContext {
void DisableTuning() override;
bool IsTuningEnabled() const override;
void SetMaxTuningDurationMs(int max_duration_ms) override;
int GetMaxTuningDurationMs() const override;
TuningResultsManager& GetTuningResultsManager() override;
const TuningResultsManager& GetTuningResultsManager() const override;

View file

@ -23,6 +23,7 @@ constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
constexpr const char* kMiopenConvUseMaxWorkspace = "miopen_conv_use_max_workspace";
constexpr const char* kTunableOpEnable = "tunable_op_enable";
constexpr const char* kTunableOpTuningEnable = "tunable_op_tuning_enable";
constexpr const char* kTunableOpMaxTuningDurationMs = "tunable_op_max_tuning_duration_ms";
} // namespace provider_option_names
} // namespace rocm
@ -95,6 +96,12 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.tuning_enable));
return Status::OK();
})
.AddValueParser(
rocm::provider_option_names::kTunableOpMaxTuningDurationMs,
[&info](const std::string& value_str) -> Status {
ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, info.tunable_op.max_tuning_duration_ms));
return Status::OK();
})
.Parse(options));
ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
@ -116,6 +123,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution
{rocm::provider_option_names::kMiopenConvUseMaxWorkspace, MakeStringWithClassicLocale(info.miopen_conv_use_max_workspace)},
{rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op.enable)},
{rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op.tuning_enable)},
{rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op.max_tuning_duration_ms)},
};
return options;
@ -130,6 +138,7 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const OrtROCMProvid
{rocm::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
{rocm::provider_option_names::kTunableOpEnable, MakeStringWithClassicLocale(info.tunable_op_enable)},
{rocm::provider_option_names::kTunableOpTuningEnable, MakeStringWithClassicLocale(info.tunable_op_tuning_enable)},
{rocm::provider_option_names::kTunableOpMaxTuningDurationMs, MakeStringWithClassicLocale(info.tunable_op_max_tuning_duration_ms)},
};
return options;

View file

@ -40,6 +40,7 @@ namespace rocm {
struct TunableOpInfo {
bool enable{false};
bool tuning_enable{false};
int max_tuning_duration_ms{};
};
} // namespace rocm
@ -76,6 +77,7 @@ struct std::hash<::onnxruntime::rocm::TunableOpInfo> {
size_t seed_and_value{0xbc9f1d34};
onnxruntime::HashCombine(info.enable, seed_and_value);
onnxruntime::HashCombine(info.tuning_enable, seed_and_value);
onnxruntime::HashCombine(info.max_tuning_duration_ms, seed_and_value);
return seed_and_value;
}
};

View file

@ -175,6 +175,7 @@ struct ROCM_Provider : Provider {
info.default_memory_arena_cfg = params->default_memory_arena_cfg;
info.tunable_op.enable = params->tunable_op_enable;
info.tunable_op.tuning_enable = params->tunable_op_tuning_enable;
info.tunable_op.max_tuning_duration_ms = params->tunable_op_max_tuning_duration_ms;
return std::make_shared<ROCMProviderFactory>(info);
}
@ -193,6 +194,7 @@ struct ROCM_Provider : Provider {
rocm_options.default_memory_arena_cfg = info.default_memory_arena_cfg;
rocm_options.tunable_op_enable = info.tunable_op.enable;
rocm_options.tunable_op_tuning_enable = info.tunable_op.tuning_enable;
rocm_options.tunable_op_max_tuning_duration_ms = info.tunable_op.max_tuning_duration_ms;
}
ProviderOptions GetProviderOptions(const void* provider_options) override {

View file

@ -114,6 +114,14 @@ bool RocmTuningContext::IsTuningEnabled() const {
return info_->tuning_enable;
}
void RocmTuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
info_->max_tuning_duration_ms = max_duration_ms;
}
int RocmTuningContext::GetMaxTuningDurationMs() const {
return info_->max_tuning_duration_ms > 0 ? info_->max_tuning_duration_ms : std::numeric_limits<int>::max();
}
TuningResultsManager& RocmTuningContext::GetTuningResultsManager() {
return manager_;
}

View file

@ -41,6 +41,9 @@ class RocmTuningContext : public ITuningContext {
void DisableTuning() override;
bool IsTuningEnabled() const override;
void SetMaxTuningDurationMs(int max_duration_ms) override;
int GetMaxTuningDurationMs() const override;
TuningResultsManager& GetTuningResultsManager() override;
const TuningResultsManager& GetTuningResultsManager() const override;

View file

@ -2086,6 +2086,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateROCMProviderOptions, _Outptr_ OrtROCMProvider
options->default_memory_arena_cfg = nullptr;
options->tunable_op_enable = 0;
options->tunable_op_tuning_enable = 0;
options->tunable_op_max_tuning_duration_ms = 0;
*out = options.release();
return nullptr;

View file

@ -57,6 +57,11 @@ class TestTuningContext : public ITuningContext {
void DisableTuning() override { tuning_enabled_ = false; }
bool IsTuningEnabled() const override { return tuning_enabled_; }
void SetMaxTuningDurationMs(int max_duration_ms) override { max_tuning_duration_ms_ = max_duration_ms; }
int GetMaxTuningDurationMs() const override {
return max_tuning_duration_ms_ > 0 ? max_tuning_duration_ms_ : std::numeric_limits<int>::max();
}
TuningResultsManager& GetTuningResultsManager() override { return manager_; }
const TuningResultsManager& GetTuningResultsManager() const override { return manager_; }
@ -67,6 +72,7 @@ class TestTuningContext : public ITuningContext {
private:
bool op_enabled_{false};
bool tuning_enabled_{false};
int max_tuning_duration_ms_{};
TuningResultsManager manager_{};
TestTuningResultsValidator validator_{};
};
@ -402,6 +408,13 @@ TEST(TunableOp, SelectFastIfTuning) {
status = op(&params);
ASSERT_TRUE(status.IsOK());
ASSERT_EQ(last_run, "FastFull");
// Also set max_tuning_duration_ms, fast should be selected
params.TuningContext()->SetMaxTuningDurationMs(10);
status = op(&params);
ASSERT_TRUE(status.IsOK());
ASSERT_EQ(last_run, "FastFull");
#endif
}

View file

@ -259,6 +259,8 @@ class TestInferenceSession(unittest.TestCase):
test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"])
test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"])
option["gpu_external_alloc"] = "0"
option["gpu_external_free"] = "0"
option["gpu_external_empty_cache"] = "0"
@ -395,6 +397,8 @@ class TestInferenceSession(unittest.TestCase):
test_get_and_set_option_with_values("tunable_op_tuning_enable", ["1", "0"])
test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"])
runRocmOptionsTest()
def testInvalidSetProviders(self): # noqa: N802

View file

@ -191,6 +191,7 @@ std::unique_ptr<IExecutionProvider> DefaultRocmExecutionProvider(bool test_tunab
provider_options.do_copy_in_default_stream = true;
provider_options.tunable_op_enable = test_tunable_op ? 1 : 0;
provider_options.tunable_op_tuning_enable = test_tunable_op ? 1 : 0;
provider_options.tunable_op_max_tuning_duration_ms = 0;
if (auto factory = RocmProviderFactoryCreator::Create(&provider_options))
return factory->CreateProvider();
#endif