diff --git a/onnxruntime/core/platform/windows/debug_alloc.cc b/onnxruntime/core/platform/windows/debug_alloc.cc index 4e993ace4e..495e54e0a3 100644 --- a/onnxruntime/core/platform/windows/debug_alloc.cc +++ b/onnxruntime/core/platform/windows/debug_alloc.cc @@ -224,7 +224,8 @@ Memory_LeakCheck::~Memory_LeakCheck() { string.find("re2::RE2::Init") == std::string::npos && string.find("testing::internal::Mutex::ThreadSafeLazyInit") == std::string::npos && string.find("testing::internal::ThreadLocalRegistryImpl::GetThreadLocalsMapLocked") == std::string::npos && - string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos) { + string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos && + string.find("PyInit_onnxruntime_pybind11_state") == std::string::npos){ if (leaked_bytes == 0) DebugPrint("\n-----Starting Heap Trace-----\n\n"); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 7a2aad88a1..9f1e098fd7 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -612,7 +612,7 @@ std::unique_ptr CreateExecutionProviderInstance( ORT_THROW("Invalid value passed for enable_vpu_fast_compile: ", option.second); } - } else if (option.first == "enable_opencl_throttling") { + } else if (option.first == "enable_opencl_throttling") { if (option.second == "True") { params.enable_opencl_throttling = true; } else if (option.second == "False") { @@ -768,7 +768,8 @@ std::unique_ptr CreateExecutionProviderInstance( #ifdef USE_QNN auto cit = provider_options_map.find(type); return onnxruntime::QNNProviderFactoryCreator::Create( - cit == provider_options_map.end() ? ProviderOptions{} : cit->second)->CreateProvider(); + cit == provider_options_map.end() ? 
ProviderOptions{} : cit->second) + ->CreateProvider(); #endif } else { // check whether it is a dynamic load EP: @@ -901,7 +902,7 @@ static void LogDeprecationWarning( } #endif -void addGlobalMethods(py::module& m, Environment& env) { +void addGlobalMethods(py::module& m) { m.def("get_default_session_options", &GetDefaultCPUSessionOptions, "Return a default session_options instance."); m.def("get_session_initializer", &SessionObjectInitializer::Get, "Return a default session object initializer."); m.def( @@ -911,16 +912,18 @@ void addGlobalMethods(py::module& m, Environment& env) { "set_seed", [](const int64_t seed) { utils::SetRandomSeed(seed); }, "Sets the seed used for random number generation in Onnxruntime."); m.def( - "set_default_logger_severity", [&env](int severity) { + "set_default_logger_severity", [](int severity) { ORT_ENFORCE(severity >= 0 && severity <= 4, "Invalid logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal"); - logging::LoggingManager* default_logging_manager = env.GetLoggingManager(); + auto env = GetEnv(); + logging::LoggingManager* default_logging_manager = env->GetLoggingManager(); default_logging_manager->SetDefaultLoggerSeverity(static_cast(severity)); }, "Sets the default logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal"); m.def( - "set_default_logger_verbosity", [&env](int vlog_level) { - logging::LoggingManager* default_logging_manager = env.GetLoggingManager(); + "set_default_logger_verbosity", [](int vlog_level) { + auto env = GetEnv(); + logging::LoggingManager* default_logging_manager = env->GetLoggingManager(); default_logging_manager->SetDefaultLoggerVerbosity(vlog_level); }, "Sets the default logging verbosity level. 
To activate the verbose log, " @@ -937,8 +940,9 @@ void addGlobalMethods(py::module& m, Environment& env) { "disable_telemetry_events", []() -> void { platform_env.GetTelemetryProvider().DisableTelemetryEvents(); }, "Disables platform-specific telemetry collection."); m.def( - "create_and_register_allocator", [&env](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void { - auto st = env.CreateAndRegisterAllocator(mem_info, arena_cfg); + "create_and_register_allocator", [](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void { + auto env = GetEnv(); + auto st = env->CreateAndRegisterAllocator(mem_info, arena_cfg); if (!st.IsOK()) { throw std::runtime_error("Error when creating and registering allocator: " + st.ErrorMessage()); } @@ -1034,7 +1038,7 @@ void addGlobalMethods(py::module& m, Environment& env) { #endif } -void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn) { +void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn) { py::enum_(m, "GraphOptimizationLevel") .value("ORT_DISABLE_ALL", GraphOptimizationLevel::ORT_DISABLE_ALL) .value("ORT_ENABLE_BASIC", GraphOptimizationLevel::ORT_ENABLE_BASIC) @@ -1077,33 +1081,33 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra // See docs/C_API.md for details on what the following parameters mean and how to choose these values ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk) { - auto ort_arena_cfg = std::make_unique(); - ort_arena_cfg->max_mem = max_mem; - ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local; - ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes; - ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk; - return ort_arena_cfg; - })) + auto ort_arena_cfg = std::make_unique(); + ort_arena_cfg->max_mem = 
max_mem; + ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local; + ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes; + ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk; + return ort_arena_cfg; + })) .def(py::init([](const py::dict& feeds) { - auto ort_arena_cfg = std::make_unique(); - for (const auto kvp : feeds) { - std::string key = kvp.first.cast(); - if (key == "max_mem") { - ort_arena_cfg->max_mem = kvp.second.cast(); - } else if (key == "arena_extend_strategy") { - ort_arena_cfg->arena_extend_strategy = kvp.second.cast(); - } else if (key == "initial_chunk_size_bytes") { - ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast(); - } else if (key == "max_dead_bytes_per_chunk") { - ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast(); - } else if (key == "initial_growth_chunk_size_bytes") { - ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast(); - } else { - ORT_THROW("Invalid OrtArenaCfg option: ", key); - } - } - return ort_arena_cfg; - })) + auto ort_arena_cfg = std::make_unique(); + for (const auto kvp : feeds) { + std::string key = kvp.first.cast(); + if (key == "max_mem") { + ort_arena_cfg->max_mem = kvp.second.cast(); + } else if (key == "arena_extend_strategy") { + ort_arena_cfg->arena_extend_strategy = kvp.second.cast(); + } else if (key == "initial_chunk_size_bytes") { + ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast(); + } else if (key == "max_dead_bytes_per_chunk") { + ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast(); + } else if (key == "initial_growth_chunk_size_bytes") { + ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast(); + } else { + ORT_THROW("Invalid OrtArenaCfg option: ", key); + } + } + return ort_arena_cfg; + })) .def_readwrite("max_mem", &OrtArenaCfg::max_mem) .def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy) .def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes) @@ 
-1135,7 +1139,7 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra "enable_cpu_mem_arena", [](const PySessionOptions* options) -> bool { return options->value.enable_cpu_mem_arena; }, [](PySessionOptions* options, bool enable_cpu_mem_arena) -> void { - options->value.enable_cpu_mem_arena = enable_cpu_mem_arena; + options->value.enable_cpu_mem_arena = enable_cpu_mem_arena; }, R"pbdoc(Enables the memory arena on CPU. Arena may pre-allocate memory for future usage. Set this option to false if you don't want it. Default is True.)pbdoc") @@ -1143,13 +1147,13 @@ Set this option to false if you don't want it. Default is True.)pbdoc") "enable_profiling", [](const PySessionOptions* options) -> bool { return options->value.enable_profiling; }, [](PySessionOptions* options, bool enable_profiling) -> void { - options->value.enable_profiling = enable_profiling; + options->value.enable_profiling = enable_profiling; }, R"pbdoc(Enable profiling for this session. Default is false.)pbdoc") .def_property( "profile_file_prefix", [](const PySessionOptions* options) -> std::basic_string { - return options->value.profile_file_prefix; + return options->value.profile_file_prefix; }, [](PySessionOptions* options, std::basic_string profile_file_prefix) -> void { options->value.profile_file_prefix = std::move(profile_file_prefix); @@ -1175,14 +1179,14 @@ Serialized model format will default to ONNX unless: "enable_mem_pattern", [](const PySessionOptions* options) -> bool { return options->value.enable_mem_pattern; }, [](PySessionOptions* options, bool enable_mem_pattern) -> void { - options->value.enable_mem_pattern = enable_mem_pattern; + options->value.enable_mem_pattern = enable_mem_pattern; }, R"pbdoc(Enable the memory pattern optimization. 
Default is true.)pbdoc") .def_property( "enable_mem_reuse", [](const PySessionOptions* options) -> bool { return options->value.enable_mem_reuse; }, [](PySessionOptions* options, bool enable_mem_reuse) -> void { - options->value.enable_mem_reuse = enable_mem_reuse; + options->value.enable_mem_reuse = enable_mem_reuse; }, R"pbdoc(Enable the memory reuse optimization. Default is true.)pbdoc") .def_property( @@ -1198,7 +1202,7 @@ Serialized model format will default to ONNX unless: "log_severity_level", [](const PySessionOptions* options) -> int { return options->value.session_log_severity_level; }, [](PySessionOptions* options, int log_severity_level) -> void { - options->value.session_log_severity_level = log_severity_level; + options->value.session_log_severity_level = log_severity_level; }, R"pbdoc(Log severity level. Applies to session load, initialization, etc. 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2.)pbdoc") @@ -1224,7 +1228,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc") "execution_mode", [](const PySessionOptions* options) -> ExecutionMode { return options->value.execution_mode; }, [](PySessionOptions* options, ExecutionMode execution_mode) -> void { - options->value.execution_mode = execution_mode; + options->value.execution_mode = execution_mode; }, R"pbdoc(Sets the execution mode. Default is sequential.)pbdoc") .def_property( @@ -1280,7 +1284,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc") "use_deterministic_compute", [](const PySessionOptions* options) -> bool { return options->value.use_deterministic_compute; }, [](PySessionOptions* options, bool use_deterministic_compute) -> void { - options->value.use_deterministic_compute = use_deterministic_compute; + options->value.use_deterministic_compute = use_deterministic_compute; }, R"pbdoc(Whether to use deterministic compute. 
Default is false.)pbdoc") .def( @@ -1477,15 +1481,16 @@ including arg name, arg type (contains both type and shape).)pbdoc") py::class_(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc") // In Python3, a Python bytes object will be passed to C++ functions that accept std::string or char* // without any conversion. So this init method can be used for model file path (string) and model content (bytes) - .def(py::init([&env](const PySessionOptions& so, const std::string arg, bool is_arg_file_name, - bool load_config_from_model = false) { + .def(py::init([](const PySessionOptions& so, const std::string arg, bool is_arg_file_name, + bool load_config_from_model = false) { + auto env = GetEnv(); std::unique_ptr sess; // separate creation of the session from model loading unless we have to read the config from the model. // in a minimal build we only support load via Load(...) and not at session creation time if (load_config_from_model) { #if !defined(ORT_MINIMAL_BUILD) - sess = std::make_unique(env, so, arg, is_arg_file_name); + sess = std::make_unique(std::move(env), so, arg, is_arg_file_name); RegisterCustomOpDomains(sess.get(), so); @@ -1494,7 +1499,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") ORT_THROW("Loading configuration from an ONNX model is not supported in this build."); #endif } else { - sess = std::make_unique(env, so); + sess = std::make_unique(std::move(env), so); #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) RegisterCustomOpDomains(sess.get(), so); #endif @@ -1600,21 +1605,10 @@ including arg name, arg type (contains both type and shape).)pbdoc") } return fetches; }) - .def("run_with_ortvaluevector", []( - PyInferenceSession* sess, - RunOptions run_options, - const std::vector& feed_names, - const std::vector& feeds, - const std::vector& fetch_names, - std::vector& fetches, - const std::vector& fetch_devices) -> void { - - { - // release GIL to allow multiple python 
threads to invoke Run() in parallel. - py::gil_scoped_release release; - OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices)); - } - + .def("run_with_ortvaluevector", [](PyInferenceSession* sess, RunOptions run_options, const std::vector& feed_names, const std::vector& feeds, const std::vector& fetch_names, std::vector& fetches, const std::vector& fetch_devices) -> void { + // release GIL to allow multiple python threads to invoke Run() in parallel. + py::gil_scoped_release release; + OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices)); }) .def("end_profiling", [](const PyInferenceSession* sess) -> std::string { return sess->GetSessionHandle()->EndProfiling(); @@ -1691,27 +1685,27 @@ including arg name, arg type (contains both type and shape).)pbdoc") return ret; #else - ORT_UNUSED_PARAMETER(sess); - ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); #endif }) .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void { #if !defined(ORT_MINIMAL_BUILD) std::vector tuning_results; - for (auto handle: results) { + for (auto handle : results) { auto py_trs = handle.cast(); TuningResults trs; trs.ep = py_trs["ep"].cast(); - for (const auto [py_op_sig, py_kernel_map]: py_trs["results"].cast()) { + for (const auto [py_op_sig, py_kernel_map] : py_trs["results"].cast()) { KernelMap kernel_map; - for (const auto [py_params_sig, py_kernel_id]: py_kernel_map.cast()) { + for (const auto [py_params_sig, py_kernel_id] : py_kernel_map.cast()) { kernel_map[py_params_sig.cast()] = py_kernel_id.cast(); } trs.results[py_op_sig.cast()] = kernel_map; } - for (const auto [k, v]: py_trs["validators"].cast()) { + for (const auto [k, v] : py_trs["validators"].cast()) 
{ trs.validators[k.cast()] = v.cast(); } @@ -1723,10 +1717,10 @@ including arg name, arg type (contains both type and shape).)pbdoc") throw std::runtime_error("Error in execution: " + status.ErrorMessage()); } #else - ORT_UNUSED_PARAMETER(sess); - ORT_UNUSED_PARAMETER(results); - ORT_UNUSED_PARAMETER(error_on_invalid); - ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_UNUSED_PARAMETER(results); + ORT_UNUSED_PARAMETER(error_on_invalid); + ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); #endif }); @@ -1746,10 +1740,10 @@ void CreateInferencePybindStateModule(py::module& m) { import_array1(); })(); - Environment& env = GetEnv(); + auto env = GetEnv(); - addGlobalMethods(m, env); - addObjectMethods(m, env, RegisterExecutionProviders); + addGlobalMethods(m); + addObjectMethods(m, RegisterExecutionProviders); addOrtValueMethods(m); addSparseTensorMethods(m); addIoBindingMethods(m); @@ -1775,34 +1769,77 @@ void InitArray() { })(); } -// static variable used to create inference session and training session. -static std::unique_ptr session_env; +namespace { +// This class provides a static shell for on-demand and thread-safe construction +// of Environment object for both Inference and Training python layers. +// Environment class contains objects such as default logger, that must be available +// for the entire duration of a program that makes use of onnxruntime library. +// Because Python is a garbage collected language and the order of destruction of objects +// is not guaranteed we design this class with the following important features. -void InitializeEnv() { - auto initialize = [&]() { +// 1) we make this class a singleton that is a function local static. The function local statics +// are constructed when the function is called the very first time. This fact has several important +// properties. 
+// - First, it is constructed before it is first needed, possibly by another static object, +// and destroyed after that object is destroyed. +// - Second, it is constructed in a thread-safe manner. +// - Last, this order of construction/destruction is enforced across the compilation units, as opposed +// to the static objects that are simply declared in order in a single unit, but whose lifespan is +// unconnected to that of statics in other compilation units. This is achieved automatically by the run-time, +// which executes atexit() to build a chain. +// 2) We make Environment owned by a shared_ptr. This is done because Python objects such as Inference and Training +// sessions depend on this global. We acquire a shared_ptr instance when those objects are instantiated +// and release it automatically when they are garbage collected. Although with this change all of the +// globals seem to be destroyed after the module is unloaded, and GC runs before that, it is cheap and gives +// peace of mind, as there were situations in the past when GC was still running after Env was gone. +// The TrainingEnv global also holds a shared reference to this global. +// 3) We guard against singleton resurrection attempts to detect code that runs when it should +// not and make necessary adjustments. +// For all the related details and why it is needed, see "Modern C++ Design" by A. Alexandrescu, Chapter 6. 
+class EnvInitializer { + public: + static std::shared_ptr SharedInstance() { + // Guard against attempts to resurrect the singleton + if (EnvInitializer::destroyed) { + ORT_THROW("Detected an attempt to resurrect destroyed Environment"); + } + static EnvInitializer env_holder; + return env_holder.Get(); + } + + private: + EnvInitializer() { // Initialization of the module InitArray(); + std::unique_ptr env_ptr; Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON); OrtPybindThrowIfError(Environment::Create(std::make_unique( std::make_unique(), Severity::kWARNING, false, LoggingManager::InstanceType::Default, &SessionObjectInitializer::default_logger_id), - session_env)); - - static bool initialized = false; - if (initialized) { - return; - } - initialized = true; - }; - initialize(); -} - -onnxruntime::Environment& GetEnv() { - if (!session_env) { - InitializeEnv(); + env_ptr)); + session_env_ = std::shared_ptr(env_ptr.release()); + destroyed = false; } - return *session_env; + + ~EnvInitializer() { + destroyed = true; + } + + std::shared_ptr Get() const { + return session_env_; + } + + std::shared_ptr session_env_; + + static bool destroyed; +}; + +bool EnvInitializer::destroyed = false; +} // namespace + +std::shared_ptr GetEnv() { + return EnvInitializer::SharedInstance(); } } // namespace python diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index add6c0afd1..6943501e0b 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -232,33 +232,36 @@ using PySessionOptions = OrtSessionOptions; // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user struct PyInferenceSession { - PyInferenceSession(Environment& env, const PySessionOptions& so) { - sess_ = std::make_unique(so.value, env); + 
PyInferenceSession(std::shared_ptr env, const PySessionOptions& so) + : env_(std::move(env)) { + sess_ = std::make_unique(so.value, *env_); } #if !defined(ORT_MINIMAL_BUILD) - PyInferenceSession(Environment& env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) { + PyInferenceSession(std::shared_ptr env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) + : env_(std::move(env)) { if (is_arg_file_name) { // Given arg is the file path. Invoke the corresponding ctor(). - sess_ = std::make_unique(so.value, env, arg); + sess_ = std::make_unique(so.value, *env_, arg); } else { // Given arg is the model content as bytes. Invoke the corresponding ctor(). std::istringstream buffer(arg); - sess_ = std::make_unique(so.value, env, buffer); + sess_ = std::make_unique(so.value, *env_, buffer); } } #endif InferenceSession* GetSessionHandle() const { return sess_.get(); } - virtual ~PyInferenceSession() {} + virtual ~PyInferenceSession() = default; protected: - PyInferenceSession(std::unique_ptr sess) { - sess_ = std::move(sess); + PyInferenceSession(std::shared_ptr env, std::unique_ptr sess) + : env_(std::move(env)), sess_(std::move(sess)) { } private: + std::shared_ptr env_; std::unique_ptr sess_; }; @@ -383,7 +386,7 @@ class SessionObjectInitializer { #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif -Environment& GetEnv(); +std::shared_ptr GetEnv(); // Initialize an InferenceSession. // Any provider_options should have entries in matching order to provider_types. 
diff --git a/orttraining/orttraining/lazy_tensor/accelerator.cc b/orttraining/orttraining/lazy_tensor/accelerator.cc index 05102b74c1..d1b407a5af 100644 --- a/orttraining/orttraining/lazy_tensor/accelerator.cc +++ b/orttraining/orttraining/lazy_tensor/accelerator.cc @@ -26,10 +26,6 @@ namespace onnxruntime { -namespace python { -Environment& GetTrainingORTEnv(); -} - namespace lazytensor { namespace py = pybind11; @@ -304,11 +300,11 @@ static std::unique_ptr CreateSession() { #ifdef USE_CUDA NvtxRange range(__func__); #endif - // Environment shared by all sessions. - static onnxruntime::Environment& pybind_default_env = onnxruntime::python::GetTrainingORTEnv(); // All sessions use the same config. static onnxruntime::SessionOptions sess_opts; - return std::make_unique(sess_opts, pybind_default_env); + // Query the singleton always, to make sure we detect shutdown + auto ort_env = onnxruntime::python::GetEnv(); + return std::make_unique(sess_opts, *ort_env); } static OrtDevice CheckAndGetTensorDevice(const at::ArrayRef& values) { diff --git a/orttraining/orttraining/python/orttraining_pybind_common.h b/orttraining/orttraining/python/orttraining_pybind_common.h index c3a5422c22..46fe3efd41 100644 --- a/orttraining/orttraining/python/orttraining_pybind_common.h +++ b/orttraining/orttraining/python/orttraining_pybind_common.h @@ -22,7 +22,7 @@ class ORTTrainingPythonEnv { public: ORTTrainingPythonEnv(); - Environment& GetORTEnv(); + std::shared_ptr GetORTEnv() const; std::shared_ptr GetExecutionProviderInstance(const std::string& provider_type, size_t hash); @@ -45,7 +45,7 @@ class ORTTrainingPythonEnv { std::string GetExecutionProviderMapKey(const std::string& provider_type, size_t hash); - std::unique_ptr ort_env_; + std::shared_ptr ort_env_; ExecutionProviderMap execution_provider_instances_map_; std::vector available_training_eps_; }; diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc 
index 14a407dc0c..71c0fe5654 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -49,7 +49,6 @@ using namespace onnxruntime; using namespace onnxruntime::logging; using namespace onnxruntime::training; -Environment& GetTrainingORTEnv(); ORTTrainingPythonEnv& GetTrainingEnv(); void ResolveExtraProviderOptions(const std::vector& provider_types, @@ -169,9 +168,13 @@ struct TrainingConfigurationResult { struct PyOptimizer { PyOptimizer(const std::string optimizer_model_uri, onnxruntime::training::api::Module* model, std::vector> provider) - : optimizer_(std::make_unique(optimizer_model_uri, - model->NamedParameters(), onnxruntime::SessionOptions(), - GetTrainingORTEnv(), provider)) { + : optimizer_() { + + auto env = GetTrainingEnv().GetORTEnv(); + // XXX: We hope that env will be around when optimizer needs it. + optimizer_ = std::make_shared(optimizer_model_uri, + model->NamedParameters(), onnxruntime::SessionOptions(), + *env, provider); } std::shared_ptr optimizer_; @@ -549,20 +552,21 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user struct PyTrainingSession : public PyInferenceSession { - PyTrainingSession(Environment& env, const PySessionOptions& so) - : PyInferenceSession(std::make_unique(so.value, env)) { + PyTrainingSession(std::shared_ptr env, const PySessionOptions& so) + : PyInferenceSession(env, std::make_unique(so.value, *env)) { } + ~PyTrainingSession() = default; }; py::class_ training_session(m, "TrainingSession"); training_session .def(py::init([](const PySessionOptions& so) { - Environment& env = GetTrainingORTEnv(); - return std::make_unique(env, so); + auto& training_env = GetTrainingEnv(); + return std::make_unique(training_env.GetORTEnv(), so); })) .def(py::init([]() { - Environment& env = GetTrainingORTEnv(); - 
return std::make_unique(env, GetDefaultCPUSessionOptions()); + auto& training_env = GetTrainingEnv(); + return std::make_unique(training_env.GetORTEnv(), GetDefaultCPUSessionOptions()); })) .def("finalize", [](py::object) { #if defined(USE_MPI) @@ -876,10 +880,11 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn onnxruntime::SessionOptions session_option; std::vector> provider = GetExecutionProvidersForTrainingApis(device); + auto env = GetTrainingEnv().GetORTEnv(); return std::make_unique( model_uri, state.module_checkpoint_state.named_parameters, session_option, - GetTrainingORTEnv(), provider, eval_model_uri); + *env, provider, eval_model_uri); })) .def("train_step", [](onnxruntime::training::api::Module* model, diff --git a/orttraining/orttraining/python/orttraining_python_module.cc b/orttraining/orttraining/python/orttraining_python_module.cc index f2dbb27072..40caa07684 100644 --- a/orttraining/orttraining/python/orttraining_python_module.cc +++ b/orttraining/orttraining/python/orttraining_python_module.cc @@ -36,8 +36,8 @@ const ROCMExecutionProviderInfo GetRocmExecutionProviderInfo(ProviderInfo_ROCM* const ProviderOptionsMap& provider_options_map); #endif -void addGlobalMethods(py::module& m, Environment& env); -void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn); +void addGlobalMethods(py::module& m); +void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn); void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn); void addObjectMethodsForEager(py::module& m); #ifdef ENABLE_LAZY_TENSOR @@ -133,18 +133,13 @@ bool GetProviderInstanceHash(const std::string& type, return false; } -ORTTrainingPythonEnv::ORTTrainingPythonEnv() { - OrtPybindThrowIfError(Environment::Create(std::make_unique( - std::make_unique(), - Severity::kWARNING, false, LoggingManager::InstanceType::Default, - 
&SessionObjectInitializer::default_logger_id), - ort_env_)); - auto& builtinEPs = GetAvailableExecutionProviderNames(); +ORTTrainingPythonEnv::ORTTrainingPythonEnv() : ort_env_(GetEnv()) { + const auto& builtinEPs = GetAvailableExecutionProviderNames(); available_training_eps_.assign(builtinEPs.begin(), builtinEPs.end()); } -Environment& ORTTrainingPythonEnv::GetORTEnv() { - return *ort_env_; +std::shared_ptr ORTTrainingPythonEnv::GetORTEnv() const { + return ort_env_; } std::shared_ptr ORTTrainingPythonEnv::GetExecutionProviderInstance(const std::string& provider_type, @@ -183,37 +178,82 @@ void ORTTrainingPythonEnv::ClearExecutionProviderInstances() { execution_provider_instances_map_.clear(); } -static std::unique_ptr ort_training_env; +namespace { -void InitializeTrainingEnv() { - auto initialize = [&]() { - static bool initialized = false; - if (initialized) { - return; +// This class provides a static shell for on-demand and thread-safe construction +// of ORTTrainingPythonEnv object for both Inference and Training python layers. +// ORTTrainingPythonEnv class contains instances of execution providers that have been +// instantiated for training purposes. It depends on the Environment singleton to which it +// holds a shared_ptr instance. +// +// 1) we make this class a singleton that is a function local static. The function local statics +// are constructed when the function is called the very first time. This fact has several important +// properties. +// - First, it is constructed before it is first needed possibly by another static object +// and destroyed after that object is destroyed. +// - Second, it is constructed in a thread safe manner. +// - Last, this order of construction/destruction is enforced across the compilation units, as opposed +// to the static objects that are simply declared in order in a single unit, but their lifespan is +// unconnected to that of in other compilation units. 
This is achieved automatically by the run-time, +// which executes atexit() to build a chain. +// 2) This ORTTrainingPythonEnv is currently owned by a unique_ptr, unlike the Environment singleton. This is +// because we currently do not see a need to refer to it from any of the Python objects or from other singletons. +// With this change this singleton is properly destroyed after the Python module is unloaded, but before the Environment. +// HOWEVER, because it holds instances of execution providers, we want to make sure that those instances are destroyed +// before the EP DLLs they depend on are unloaded, so that the EP destructors can run. +// This static is destroyed when this compilation unit is unloaded and it generally happens +// AFTER EP DLLs are unloaded. To mitigate that, we clear EP instances using the Python `atexit` (different from C atexit()) +// mechanism, which takes place after all Python objects are GCed but before any DLLs are unloaded or +// the runtime starts destroying globals. +// 3) We guard against singleton resurrection attempts to detect code that runs when it should not +// and make necessary adjustments. +// For all the related details and why it is needed, see "Modern C++ Design" by A. Alexandrescu, Chapter 6. 
+class TrainingEnvInitialzer { + public: + + static ORTTrainingPythonEnv& Instance() { + // Guard against attempts to resurrect the singleton + if (TrainingEnvInitialzer::destroyed) { + ORT_THROW("Detected an attempt to resurrect destroyed Training Environment"); } - // Initialization of the module + + static TrainingEnvInitialzer training_env_holder; + + return training_env_holder.Get(); + } + + private: + + TrainingEnvInitialzer() { InitArray(); Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON); - ort_training_env = std::make_unique(); - initialized = true; - }; - initialize(); -} + ort_training_env_ = std::make_unique(); + } + + ~TrainingEnvInitialzer() { + destroyed = true; + } + + ORTTrainingPythonEnv& Get() noexcept { + return *ort_training_env_; + } + + std::unique_ptr ort_training_env_; + + static bool destroyed; +}; + +bool TrainingEnvInitialzer::destroyed = false; + +} // namespace ORTTrainingPythonEnv& GetTrainingEnv() { - if (!ort_training_env) { - InitializeTrainingEnv(); - } - return *ort_training_env; -} - -Environment& GetTrainingORTEnv() { - if (!ort_training_env) { - InitializeTrainingEnv(); - } - return ort_training_env->GetORTEnv(); + return TrainingEnvInitialzer::Instance(); } +// TODO: If this global has a conflicting lifespan with other globals +// such as Environment, follow the global objects management pattern for +// Environment and ORTTrainingPythonEnv #ifdef ENABLE_EAGER_MODE using namespace torch_ort::eager; static std::unique_ptr ort_backends_manager_instance; @@ -225,8 +265,9 @@ void InitializeBackendsManager() { return; } // Initialization of the module - auto& env = onnxruntime::python::GetTrainingORTEnv(); - ort_backends_manager_instance = std::make_unique(env.GetLoggingManager()->DefaultLogger()); + auto& training_env = onnxruntime::python::GetTrainingEnv(); + auto env = training_env.GetORTEnv(); + ort_backends_manager_instance = 
std::make_unique(env->GetLoggingManager()->DefaultLogger()); initialized = true; }; initialize(); @@ -247,7 +288,7 @@ void ResolveExtraProviderOptions(const std::vector& provider_types, for (auto& provider_type : provider_types) { auto it = training_env.ext_execution_provider_info_map_.find(provider_type); if (it == training_env.ext_execution_provider_info_map_.end()) { - //nothing changed. + // nothing changed. if (original_provider_options_map.find(provider_type) != original_provider_options_map.end()) merged_options.insert({provider_type, original_provider_options_map.at(provider_type)}); } else { @@ -318,9 +359,10 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) { m.doc() = "pybind11 stateful interface to ORTTraining"; RegisterExceptions(m); - Environment& env = GetTrainingORTEnv(); - addGlobalMethods(m, env); - addObjectMethods(m, env, ORTTrainingRegisterExecutionProviders); + // Instantiate singletons + GetTrainingEnv(); + addGlobalMethods(m); + addObjectMethods(m, ORTTrainingRegisterExecutionProviders); addOrtValueMethods(m); addSparseTensorMethods(m); addIoBindingMethods(m); @@ -357,19 +399,20 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) { m.def("get_version_string", []() -> std::string { return ORT_VERSION; }); - m.def("clear_training_ep_instances", []() -> void { - ort_training_env->ClearExecutionProviderInstances(); - }, - "Clean the execution provider instances used in ort training module."); + m.def( + "clear_training_ep_instances", []() -> void { + GetTrainingEnv().ClearExecutionProviderInstances(); + }, + "Clean the execution provider instances used in ort training module."); - // clean the ort training environment when python interpreter exit - // otherwise the global var will be de-constrcut after user main. - // the order of ort training environment deconstruction and cudart - // deconstruction is not stable, which will lead to crash. 
+ // See documentation for class TrainingEnvInitialzer earlier in this module + // for an explanation as to why this is needed. auto atexit = py::module_::import("atexit"); atexit.attr("register")(py::cpp_function([]() { - ort_training_env = nullptr; + GetTrainingEnv().ClearExecutionProviderInstances(); #ifdef ENABLE_EAGER_MODE + // This singleton should also be re-factored into a function local static + // so its lifetime is properly managed. ort_backends_manager_instance = nullptr; #endif }));