Re-work global object dependencies in pybind layer. (#14941)

### Description Re-work handling of static objects in pybind. Make sure we ref-count Environment from Sessions. The following has been done: - Make global objects function-local statics. This ensures that the objects are constructed on demand. The first object constructed is destructed last. This is platform independent. - Make ownership of global objects shared, as suggested by pybind, since they are not surfaced at the Python level and cannot be referred to by dependent Python objects. Verified that all Python objects are GCed before globals are destroyed. This takes care of the inference session's dependency on the environment and its default logger, and it is also platform independent. - Utilize the pybind atexit mechanism to clear execution providers and unload CUDA libraries (as suggested by https://github.com/microsoft/onnxruntime/pull/14903). Since this is registered for module exit, it takes place before any other globals are destroyed and clears shared object state or even unloads the libraries. This should also work in a platform-independent way. ### Motivation and Context - Global object destruction order is managed manually, and that becomes a source of trouble. We want to make it deterministic and platform independent. - Frequent hangs in the Python layer due to the destruction order of static objects. Some of the Python session objects are garbage collected after main exits, and they require the ORT environment to be alive. (Use after free)
2026-07-21 19:18:55 +00:00 · 2023-03-10 13:55:31 -08:00 · 2023-03-10 13:55:31 -08:00 · 0d7855ea5a
commit 0d7855ea5a
parent e2febe87f6
7 changed files with 259 additions and 174 deletions
--- a/onnxruntime/core/platform/windows/debug_alloc.cc
+++ b/onnxruntime/core/platform/windows/debug_alloc.cc
@ -224,7 +224,8 @@ Memory_LeakCheck::~Memory_LeakCheck() {
        string.find("re2::RE2::Init") == std::string::npos &&
        string.find("testing::internal::Mutex::ThreadSafeLazyInit") == std::string::npos &&
        string.find("testing::internal::ThreadLocalRegistryImpl::GetThreadLocalsMapLocked") == std::string::npos &&
-        string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos) {
+        string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos &&
+        string.find("PyInit_onnxruntime_pybind11_state") == std::string::npos){
      if (leaked_bytes == 0)
        DebugPrint("\n-----Starting Heap Trace-----\n\n");

--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@ -612,7 +612,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
            ORT_THROW("Invalid value passed for enable_vpu_fast_compile: ", option.second);
          }

-        }  else if (option.first == "enable_opencl_throttling") {
+        } else if (option.first == "enable_opencl_throttling") {
          if (option.second == "True") {
            params.enable_opencl_throttling = true;
          } else if (option.second == "False") {
@ -768,7 +768,8 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
 #ifdef USE_QNN
    auto cit = provider_options_map.find(type);
    return onnxruntime::QNNProviderFactoryCreator::Create(
-               cit == provider_options_map.end() ? ProviderOptions{} : cit->second)->CreateProvider();
+               cit == provider_options_map.end() ? ProviderOptions{} : cit->second)
+        ->CreateProvider();
 #endif
  } else {
    // check whether it is a dynamic load EP:
@ -901,7 +902,7 @@ static void LogDeprecationWarning(
 }
 #endif

-void addGlobalMethods(py::module& m, Environment& env) {
+void addGlobalMethods(py::module& m) {
  m.def("get_default_session_options", &GetDefaultCPUSessionOptions, "Return a default session_options instance.");
  m.def("get_session_initializer", &SessionObjectInitializer::Get, "Return a default session object initializer.");
  m.def(
@ -911,16 +912,18 @@ void addGlobalMethods(py::module& m, Environment& env) {
      "set_seed", [](const int64_t seed) { utils::SetRandomSeed(seed); },
      "Sets the seed used for random number generation in Onnxruntime.");
  m.def(
-      "set_default_logger_severity", [&env](int severity) {
+      "set_default_logger_severity", [](int severity) {
        ORT_ENFORCE(severity >= 0 && severity <= 4,
                    "Invalid logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal");
-        logging::LoggingManager* default_logging_manager = env.GetLoggingManager();
+        auto env = GetEnv();
+        logging::LoggingManager* default_logging_manager = env->GetLoggingManager();
        default_logging_manager->SetDefaultLoggerSeverity(static_cast<logging::Severity>(severity));
      },
      "Sets the default logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal");
  m.def(
-      "set_default_logger_verbosity", [&env](int vlog_level) {
-        logging::LoggingManager* default_logging_manager = env.GetLoggingManager();
+      "set_default_logger_verbosity", [](int vlog_level) {
+        auto env = GetEnv();
+        logging::LoggingManager* default_logging_manager = env->GetLoggingManager();
        default_logging_manager->SetDefaultLoggerVerbosity(vlog_level);
      },
      "Sets the default logging verbosity level. To activate the verbose log, "
@ -937,8 +940,9 @@ void addGlobalMethods(py::module& m, Environment& env) {
      "disable_telemetry_events", []() -> void { platform_env.GetTelemetryProvider().DisableTelemetryEvents(); },
      "Disables platform-specific telemetry collection.");
  m.def(
-      "create_and_register_allocator", [&env](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void {
-        auto st = env.CreateAndRegisterAllocator(mem_info, arena_cfg);
+      "create_and_register_allocator", [](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void {
+        auto env = GetEnv();
+        auto st = env->CreateAndRegisterAllocator(mem_info, arena_cfg);
        if (!st.IsOK()) {
          throw std::runtime_error("Error when creating and registering allocator: " + st.ErrorMessage());
        }
@ -1034,7 +1038,7 @@ void addGlobalMethods(py::module& m, Environment& env) {
 #endif
 }

-void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn) {
+void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn) {
  py::enum_<GraphOptimizationLevel>(m, "GraphOptimizationLevel")
      .value("ORT_DISABLE_ALL", GraphOptimizationLevel::ORT_DISABLE_ALL)
      .value("ORT_ENABLE_BASIC", GraphOptimizationLevel::ORT_ENABLE_BASIC)
@ -1077,33 +1081,33 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra
  // See docs/C_API.md for details on what the following parameters mean and how to choose these values
  ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local,
                                        int initial_chunk_size_bytes, int max_dead_bytes_per_chunk) {
-    auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
-    ort_arena_cfg->max_mem = max_mem;
-    ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local;
-    ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
-    ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
-    return ort_arena_cfg;
-  }))
+                         auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
+                         ort_arena_cfg->max_mem = max_mem;
+                         ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local;
+                         ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
+                         ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
+                         return ort_arena_cfg;
+                       }))
      .def(py::init([](const py::dict& feeds) {
-    auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
-    for (const auto kvp : feeds) {
-      std::string key = kvp.first.cast<std::string>();
-      if (key == "max_mem") {
-        ort_arena_cfg->max_mem = kvp.second.cast<size_t>();
-      } else if (key == "arena_extend_strategy") {
-        ort_arena_cfg->arena_extend_strategy = kvp.second.cast<int>();
-      } else if (key == "initial_chunk_size_bytes") {
-        ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast<int>();
-      } else if (key == "max_dead_bytes_per_chunk") {
-        ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
-      } else if (key == "initial_growth_chunk_size_bytes") {
-        ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
-        } else {
-        ORT_THROW("Invalid OrtArenaCfg option: ", key);
-      }
-    }
-    return ort_arena_cfg;
-  }))
+        auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
+        for (const auto kvp : feeds) {
+          std::string key = kvp.first.cast<std::string>();
+          if (key == "max_mem") {
+            ort_arena_cfg->max_mem = kvp.second.cast<size_t>();
+          } else if (key == "arena_extend_strategy") {
+            ort_arena_cfg->arena_extend_strategy = kvp.second.cast<int>();
+          } else if (key == "initial_chunk_size_bytes") {
+            ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast<int>();
+          } else if (key == "max_dead_bytes_per_chunk") {
+            ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
+          } else if (key == "initial_growth_chunk_size_bytes") {
+            ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
+          } else {
+            ORT_THROW("Invalid OrtArenaCfg option: ", key);
+          }
+        }
+        return ort_arena_cfg;
+      }))
      .def_readwrite("max_mem", &OrtArenaCfg::max_mem)
      .def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy)
      .def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes)
@ -1135,7 +1139,7 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra
          "enable_cpu_mem_arena",
          [](const PySessionOptions* options) -> bool { return options->value.enable_cpu_mem_arena; },
          [](PySessionOptions* options, bool enable_cpu_mem_arena) -> void {
-              options->value.enable_cpu_mem_arena = enable_cpu_mem_arena;
+            options->value.enable_cpu_mem_arena = enable_cpu_mem_arena;
          },
          R"pbdoc(Enables the memory arena on CPU. Arena may pre-allocate memory for future usage.
 Set this option to false if you don't want it. Default is True.)pbdoc")
@ -1143,13 +1147,13 @@ Set this option to false if you don't want it. Default is True.)pbdoc")
          "enable_profiling",
          [](const PySessionOptions* options) -> bool { return options->value.enable_profiling; },
          [](PySessionOptions* options, bool enable_profiling) -> void {
-              options->value.enable_profiling = enable_profiling;
+            options->value.enable_profiling = enable_profiling;
          },
          R"pbdoc(Enable profiling for this session. Default is false.)pbdoc")
      .def_property(
          "profile_file_prefix",
          [](const PySessionOptions* options) -> std::basic_string<ORTCHAR_T> {
-              return options->value.profile_file_prefix;
+            return options->value.profile_file_prefix;
          },
          [](PySessionOptions* options, std::basic_string<ORTCHAR_T> profile_file_prefix) -> void {
            options->value.profile_file_prefix = std::move(profile_file_prefix);
@ -1175,14 +1179,14 @@ Serialized model format will default to ONNX unless:
          "enable_mem_pattern",
          [](const PySessionOptions* options) -> bool { return options->value.enable_mem_pattern; },
          [](PySessionOptions* options, bool enable_mem_pattern) -> void {
-              options->value.enable_mem_pattern = enable_mem_pattern;
+            options->value.enable_mem_pattern = enable_mem_pattern;
          },
          R"pbdoc(Enable the memory pattern optimization. Default is true.)pbdoc")
      .def_property(
          "enable_mem_reuse",
          [](const PySessionOptions* options) -> bool { return options->value.enable_mem_reuse; },
          [](PySessionOptions* options, bool enable_mem_reuse) -> void {
-              options->value.enable_mem_reuse = enable_mem_reuse;
+            options->value.enable_mem_reuse = enable_mem_reuse;
          },
          R"pbdoc(Enable the memory reuse optimization. Default is true.)pbdoc")
      .def_property(
@ -1198,7 +1202,7 @@ Serialized model format will default to ONNX unless:
          "log_severity_level",
          [](const PySessionOptions* options) -> int { return options->value.session_log_severity_level; },
          [](PySessionOptions* options, int log_severity_level) -> void {
-              options->value.session_log_severity_level = log_severity_level;
+            options->value.session_log_severity_level = log_severity_level;
          },
          R"pbdoc(Log severity level. Applies to session load, initialization, etc.
 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2.)pbdoc")
@ -1224,7 +1228,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
          "execution_mode",
          [](const PySessionOptions* options) -> ExecutionMode { return options->value.execution_mode; },
          [](PySessionOptions* options, ExecutionMode execution_mode) -> void {
-              options->value.execution_mode = execution_mode;
+            options->value.execution_mode = execution_mode;
          },
          R"pbdoc(Sets the execution mode. Default is sequential.)pbdoc")
      .def_property(
@ -1280,7 +1284,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
          "use_deterministic_compute",
          [](const PySessionOptions* options) -> bool { return options->value.use_deterministic_compute; },
          [](PySessionOptions* options, bool use_deterministic_compute) -> void {
-              options->value.use_deterministic_compute = use_deterministic_compute;
+            options->value.use_deterministic_compute = use_deterministic_compute;
          },
          R"pbdoc(Whether to use deterministic compute. Default is false.)pbdoc")
      .def(
@ -1477,15 +1481,16 @@ including arg name, arg type (contains both type and shape).)pbdoc")
  py::class_<PyInferenceSession>(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc")
      // In Python3, a Python bytes object will be passed to C++ functions that accept std::string or char*
      // without any conversion. So this init method can be used for model file path (string) and model content (bytes)
-      .def(py::init([&env](const PySessionOptions& so, const std::string arg, bool is_arg_file_name,
-                           bool load_config_from_model = false) {
+      .def(py::init([](const PySessionOptions& so, const std::string arg, bool is_arg_file_name,
+                       bool load_config_from_model = false) {
+        auto env = GetEnv();
        std::unique_ptr<PyInferenceSession> sess;

        // separate creation of the session from model loading unless we have to read the config from the model.
        // in a minimal build we only support load via Load(...) and not at session creation time
        if (load_config_from_model) {
 #if !defined(ORT_MINIMAL_BUILD)
-          sess = std::make_unique<PyInferenceSession>(env, so, arg, is_arg_file_name);
+          sess = std::make_unique<PyInferenceSession>(std::move(env), so, arg, is_arg_file_name);

          RegisterCustomOpDomains(sess.get(), so);

@ -1494,7 +1499,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
          ORT_THROW("Loading configuration from an ONNX model is not supported in this build.");
 #endif
        } else {
-          sess = std::make_unique<PyInferenceSession>(env, so);
+          sess = std::make_unique<PyInferenceSession>(std::move(env), so);
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
          RegisterCustomOpDomains(sess.get(), so);
 #endif
@ -1600,21 +1605,10 @@ including arg name, arg type (contains both type and shape).)pbdoc")
        }
        return fetches;
      })
-      .def("run_with_ortvaluevector", [](
-        PyInferenceSession* sess,
-        RunOptions run_options,
-        const std::vector<std::string>& feed_names,
-        const std::vector<OrtValue>& feeds,
-        const std::vector<std::string>& fetch_names,
-        std::vector<OrtValue>& fetches,
-        const std::vector<OrtDevice>& fetch_devices) -> void {
-
-        {
-          // release GIL to allow multiple python threads to invoke Run() in parallel.
-          py::gil_scoped_release release;
-          OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices));
-        }
-
+      .def("run_with_ortvaluevector", [](PyInferenceSession* sess, RunOptions run_options, const std::vector<std::string>& feed_names, const std::vector<OrtValue>& feeds, const std::vector<std::string>& fetch_names, std::vector<OrtValue>& fetches, const std::vector<OrtDevice>& fetch_devices) -> void {
+        // release GIL to allow multiple python threads to invoke Run() in parallel.
+        py::gil_scoped_release release;
+        OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices));
      })
      .def("end_profiling", [](const PyInferenceSession* sess) -> std::string {
        return sess->GetSessionHandle()->EndProfiling();
@ -1691,27 +1685,27 @@ including arg name, arg type (contains both type and shape).)pbdoc")

        return ret;
 #else
-        ORT_UNUSED_PARAMETER(sess);
-        ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
+            ORT_UNUSED_PARAMETER(sess);
+            ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
 #endif
      })
      .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void {
 #if !defined(ORT_MINIMAL_BUILD)
        std::vector<TuningResults> tuning_results;
-        for (auto handle: results) {
+        for (auto handle : results) {
          auto py_trs = handle.cast<py::dict>();
          TuningResults trs;
          trs.ep = py_trs["ep"].cast<py::str>();

-          for (const auto [py_op_sig, py_kernel_map]: py_trs["results"].cast<py::dict>()) {
+          for (const auto [py_op_sig, py_kernel_map] : py_trs["results"].cast<py::dict>()) {
            KernelMap kernel_map;
-            for (const auto [py_params_sig, py_kernel_id]: py_kernel_map.cast<py::dict>()) {
+            for (const auto [py_params_sig, py_kernel_id] : py_kernel_map.cast<py::dict>()) {
              kernel_map[py_params_sig.cast<py::str>()] = py_kernel_id.cast<py::int_>();
            }
            trs.results[py_op_sig.cast<py::str>()] = kernel_map;
          }

-          for (const auto [k, v]: py_trs["validators"].cast<py::dict>()) {
+          for (const auto [k, v] : py_trs["validators"].cast<py::dict>()) {
            trs.validators[k.cast<py::str>()] = v.cast<py::str>();
          }

@ -1723,10 +1717,10 @@ including arg name, arg type (contains both type and shape).)pbdoc")
          throw std::runtime_error("Error in execution: " + status.ErrorMessage());
        }
 #else
-        ORT_UNUSED_PARAMETER(sess);
-        ORT_UNUSED_PARAMETER(results);
-        ORT_UNUSED_PARAMETER(error_on_invalid);
-        ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
+            ORT_UNUSED_PARAMETER(sess);
+            ORT_UNUSED_PARAMETER(results);
+            ORT_UNUSED_PARAMETER(error_on_invalid);
+            ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
 #endif
      });

@ -1746,10 +1740,10 @@ void CreateInferencePybindStateModule(py::module& m) {
    import_array1();
  })();

-  Environment& env = GetEnv();
+  auto env = GetEnv();

-  addGlobalMethods(m, env);
-  addObjectMethods(m, env, RegisterExecutionProviders);
+  addGlobalMethods(m);
+  addObjectMethods(m, RegisterExecutionProviders);
  addOrtValueMethods(m);
  addSparseTensorMethods(m);
  addIoBindingMethods(m);
@ -1775,34 +1769,77 @@ void InitArray() {
  })();
 }

-// static variable used to create inference session and training session.
-static std::unique_ptr<Environment> session_env;
+namespace {
+// This class provides a static shell for on-demand and thread-safe construction
+// of Environment object for both Inference and Training python layers.
+// Environment class contains objects such as default logger, that must be available
+// for the entire duration of a program that makes use of onnxruntime library.
+// Because Python is a garbage collected language and the order of destruction of objects
+// is not guaranteed we design this class with the following important features.

-void InitializeEnv() {
-  auto initialize = [&]() {
+// 1) we make this class a singleton that is a function local static. The function local statics
+//    are constructed when the function is called the very first time. This fact has several important
+//    properties.
+//    - First, it is constructed before it is first needed possibly by another static object
+//      and destroyed after that object is destroyed.
+//    - Second, it is constructed in a thread safe manner.
+//    - Last, this order of construction/destruction is enforced across the compilation units, as opposed
+//      to the static objects that are simply declared in order in a single unit, but their lifespan is
+//      unconnected to that of objects in other compilation units. This is achieved automatically by the run-time,
+//      which executes atexit() to build a chain.
+//  2) We make Environment owned by a shared_ptr. This is done because python objects such as Inference and Training
+//    sessions depend on this global. We acquire a shared_ptr instance when those objects are instantiated
+//    and release it automatically when they are garbage collected. Although with this change all of the
+//    globals seem to have been destroyed after module is unloaded and GC runs before that, it is cheap and gives
+//    peace of mind as there were situations when GC was still running in the past after Env was gone.
+//    TrainingEnv global also holds shared reference to this global.
+// 3) We guard against singleton resurrection attempts to detect code that runs when it should
+//    not, and make necessary adjustments.
+//    For all the related details and why it is needed see "Modern C++ design" by A. Alexandrescu Chapter 6.
+class EnvInitializer {
+ public:
+  static std::shared_ptr<onnxruntime::Environment> SharedInstance() {
+    // Guard against attempts to resurrect the singleton
+    if (EnvInitializer::destroyed) {
+      ORT_THROW("Detected an attempt to resurrect destroyed Environment");
+    }
+    static EnvInitializer env_holder;
+    return env_holder.Get();
+  }
+
+ private:
+  EnvInitializer() {
    // Initialization of the module
    InitArray();
+    std::unique_ptr<Environment> env_ptr;
    Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON);
    OrtPybindThrowIfError(Environment::Create(std::make_unique<LoggingManager>(
                                                  std::make_unique<CLogSink>(),
                                                  Severity::kWARNING, false, LoggingManager::InstanceType::Default,
                                                  &SessionObjectInitializer::default_logger_id),
-                                              session_env));
-
-    static bool initialized = false;
-    if (initialized) {
-      return;
-    }
-    initialized = true;
-  };
-  initialize();
-}
-
-onnxruntime::Environment& GetEnv() {
-  if (!session_env) {
-    InitializeEnv();
+                                              env_ptr));
+    session_env_ = std::shared_ptr<Environment>(env_ptr.release());
+    destroyed = false;
  }
-  return *session_env;
+
+  ~EnvInitializer() {
+    destroyed = true;
+  }
+
+  std::shared_ptr<Environment> Get() const {
+    return session_env_;
+  }
+
+  std::shared_ptr<Environment> session_env_;
+
+  static bool destroyed;
+};
+
+bool EnvInitializer::destroyed = false;
+}  // namespace
+
+std::shared_ptr<onnxruntime::Environment> GetEnv() {
+  return EnvInitializer::SharedInstance();
 }

 }  // namespace python
--- a/onnxruntime/python/onnxruntime_pybind_state_common.h
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.h
@ -232,33 +232,36 @@ using PySessionOptions = OrtSessionOptions;

 // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user
 struct PyInferenceSession {
-  PyInferenceSession(Environment& env, const PySessionOptions& so) {
-    sess_ = std::make_unique<InferenceSession>(so.value, env);
+  PyInferenceSession(std::shared_ptr<Environment> env, const PySessionOptions& so)
+  : env_(std::move(env)) {
+    sess_ = std::make_unique<InferenceSession>(so.value, *env_);
  }

 #if !defined(ORT_MINIMAL_BUILD)
-  PyInferenceSession(Environment& env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) {
+  PyInferenceSession(std::shared_ptr<Environment> env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) 
+  : env_(std::move(env)) {
    if (is_arg_file_name) {
      // Given arg is the file path. Invoke the corresponding ctor().
-      sess_ = std::make_unique<InferenceSession>(so.value, env, arg);
+      sess_ = std::make_unique<InferenceSession>(so.value, *env_, arg);
    } else {
      // Given arg is the model content as bytes. Invoke the corresponding ctor().
      std::istringstream buffer(arg);
-      sess_ = std::make_unique<InferenceSession>(so.value, env, buffer);
+      sess_ = std::make_unique<InferenceSession>(so.value, *env_, buffer);
    }
  }
 #endif

  InferenceSession* GetSessionHandle() const { return sess_.get(); }

-  virtual ~PyInferenceSession() {}
+  virtual ~PyInferenceSession() = default;

 protected:
-  PyInferenceSession(std::unique_ptr<InferenceSession> sess) {
-    sess_ = std::move(sess);
+  PyInferenceSession(std::shared_ptr<Environment> env, std::unique_ptr<InferenceSession> sess) 
+  : env_(std::move(env)), sess_(std::move(sess)) {
  }

 private:
+  std::shared_ptr<Environment> env_;
  std::unique_ptr<InferenceSession> sess_;
 };

@ -383,7 +386,7 @@ class SessionObjectInitializer {
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
 #endif
-Environment& GetEnv();
+std::shared_ptr<Environment> GetEnv();

 // Initialize an InferenceSession.
 // Any provider_options should have entries in matching order to provider_types.
--- a/orttraining/orttraining/lazy_tensor/accelerator.cc
+++ b/orttraining/orttraining/lazy_tensor/accelerator.cc
@ -26,10 +26,6 @@

 namespace onnxruntime {

-namespace python {
-Environment& GetTrainingORTEnv();
-}
-
 namespace lazytensor {

 namespace py = pybind11;
@ -304,11 +300,11 @@ static std::unique_ptr<onnxruntime::InferenceSession> CreateSession() {
 #ifdef USE_CUDA
  NvtxRange range(__func__);
 #endif
-  // Environment shared by all sessions.
-  static onnxruntime::Environment& pybind_default_env = onnxruntime::python::GetTrainingORTEnv();
  // All sessions use the same config.
  static onnxruntime::SessionOptions sess_opts;
-  return std::make_unique<onnxruntime::InferenceSession>(sess_opts, pybind_default_env);
+  // Query the singleton always, to make sure we detect shutdown
+  auto ort_env = onnxruntime::python::GetEnv();
+  return std::make_unique<onnxruntime::InferenceSession>(sess_opts, *ort_env);
 }

 static OrtDevice CheckAndGetTensorDevice(const at::ArrayRef<c10::IValue>& values) {
--- a/orttraining/orttraining/python/orttraining_pybind_common.h
+++ b/orttraining/orttraining/python/orttraining_pybind_common.h
@ -22,7 +22,7 @@ class ORTTrainingPythonEnv {
 public:
  ORTTrainingPythonEnv();

-  Environment& GetORTEnv();
+  std::shared_ptr<Environment> GetORTEnv() const;

  std::shared_ptr<IExecutionProvider> GetExecutionProviderInstance(const std::string& provider_type,
                                                                   size_t hash);
@ -45,7 +45,7 @@ class ORTTrainingPythonEnv {
  std::string GetExecutionProviderMapKey(const std::string& provider_type,
                                         size_t hash);

-  std::unique_ptr<Environment> ort_env_;
+  std::shared_ptr<Environment> ort_env_;
  ExecutionProviderMap execution_provider_instances_map_;
  std::vector<std::string> available_training_eps_;
 };
--- a/orttraining/orttraining/python/orttraining_pybind_state.cc
+++ b/orttraining/orttraining/python/orttraining_pybind_state.cc
@ -49,7 +49,6 @@ using namespace onnxruntime;
 using namespace onnxruntime::logging;
 using namespace onnxruntime::training;

-Environment& GetTrainingORTEnv();
 ORTTrainingPythonEnv& GetTrainingEnv();

 void ResolveExtraProviderOptions(const std::vector<std::string>& provider_types,
@ -169,9 +168,13 @@ struct TrainingConfigurationResult {
 struct PyOptimizer {
  PyOptimizer(const std::string optimizer_model_uri,
              onnxruntime::training::api::Module* model, std::vector<std::shared_ptr<IExecutionProvider>> provider)
-      : optimizer_(std::make_unique<onnxruntime::training::api::Optimizer>(optimizer_model_uri,
-                                                                           model->NamedParameters(), onnxruntime::SessionOptions(),
-                                                                           GetTrainingORTEnv(), provider)) {
+      : optimizer_() {
+
+    auto env = GetTrainingEnv().GetORTEnv();
+    // XXX: We hope that env will be around when optimizer needs it.
+    optimizer_ = std::make_shared<onnxruntime::training::api::Optimizer>(optimizer_model_uri,
+                                                                         model->NamedParameters(), onnxruntime::SessionOptions(),
+                                                                         *env, provider);
  }

  std::shared_ptr<onnxruntime::training::api::Optimizer> optimizer_;
@ -549,20 +552,21 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn

  // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user
  struct PyTrainingSession : public PyInferenceSession {
-    PyTrainingSession(Environment& env, const PySessionOptions& so)
-        : PyInferenceSession(std::make_unique<PipelineTrainingSession>(so.value, env)) {
+    PyTrainingSession(std::shared_ptr<Environment> env, const PySessionOptions& so)
+        : PyInferenceSession(env, std::make_unique<PipelineTrainingSession>(so.value, *env)) {
    }
+    ~PyTrainingSession() = default;
  };

  py::class_<PyTrainingSession, PyInferenceSession> training_session(m, "TrainingSession");
  training_session
      .def(py::init([](const PySessionOptions& so) {
-        Environment& env = GetTrainingORTEnv();
-        return std::make_unique<PyTrainingSession>(env, so);
+        auto& training_env = GetTrainingEnv();
+        return std::make_unique<PyTrainingSession>(training_env.GetORTEnv(), so);
      }))
      .def(py::init([]() {
-        Environment& env = GetTrainingORTEnv();
-        return std::make_unique<PyTrainingSession>(env, GetDefaultCPUSessionOptions());
+        auto& training_env = GetTrainingEnv();
+        return std::make_unique<PyTrainingSession>(training_env.GetORTEnv(), GetDefaultCPUSessionOptions());
      }))
      .def("finalize", [](py::object) {
 #if defined(USE_MPI)
@ -876,10 +880,11 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn
        onnxruntime::SessionOptions session_option;
        std::vector<std::shared_ptr<IExecutionProvider>> provider = GetExecutionProvidersForTrainingApis(device);

+        auto env = GetTrainingEnv().GetORTEnv();
        return std::make_unique<onnxruntime::training::api::Module>(
            model_uri,
            state.module_checkpoint_state.named_parameters, session_option,
-            GetTrainingORTEnv(), provider, eval_model_uri);
+            *env, provider, eval_model_uri);
      }))
      .def("train_step",
           [](onnxruntime::training::api::Module* model,
--- a/orttraining/orttraining/python/orttraining_python_module.cc
+++ b/orttraining/orttraining/python/orttraining_python_module.cc
@ -36,8 +36,8 @@ const ROCMExecutionProviderInfo GetRocmExecutionProviderInfo(ProviderInfo_ROCM*
                                                             const ProviderOptionsMap& provider_options_map);
 #endif

-void addGlobalMethods(py::module& m, Environment& env);
-void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn);
+void addGlobalMethods(py::module& m);
+void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn);
 void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn);
 void addObjectMethodsForEager(py::module& m);
 #ifdef ENABLE_LAZY_TENSOR
@ -133,18 +133,13 @@ bool GetProviderInstanceHash(const std::string& type,
  return false;
 }

-ORTTrainingPythonEnv::ORTTrainingPythonEnv() {
-  OrtPybindThrowIfError(Environment::Create(std::make_unique<LoggingManager>(
-                                                std::make_unique<CLogSink>(),
-                                                Severity::kWARNING, false, LoggingManager::InstanceType::Default,
-                                                &SessionObjectInitializer::default_logger_id),
-                                            ort_env_));
-  auto& builtinEPs = GetAvailableExecutionProviderNames();
+ORTTrainingPythonEnv::ORTTrainingPythonEnv() : ort_env_(GetEnv()) {
+  const auto& builtinEPs = GetAvailableExecutionProviderNames();
  available_training_eps_.assign(builtinEPs.begin(), builtinEPs.end());
 }

-Environment& ORTTrainingPythonEnv::GetORTEnv() {
-  return *ort_env_;
+std::shared_ptr<Environment> ORTTrainingPythonEnv::GetORTEnv() const {
+  return ort_env_;
 }

 std::shared_ptr<IExecutionProvider> ORTTrainingPythonEnv::GetExecutionProviderInstance(const std::string& provider_type,
@ -183,37 +178,82 @@ void ORTTrainingPythonEnv::ClearExecutionProviderInstances() {
  execution_provider_instances_map_.clear();
 }

-static std::unique_ptr<ORTTrainingPythonEnv> ort_training_env;
+namespace {

-void InitializeTrainingEnv() {
-  auto initialize = [&]() {
-    static bool initialized = false;
-    if (initialized) {
-      return;
+// This class provides a static shell for on-demand and thread-safe construction
+// of the ORTTrainingPythonEnv object for both the Inference and Training Python layers.
+// The ORTTrainingPythonEnv class contains instances of execution providers that have been
+// instantiated for training purposes. It depends on the Environment singleton, to which it
+// holds a shared_ptr.
+//
+// 1) We make this class a singleton that is a function-local static. Function-local statics
+//    are constructed when the function is called for the very first time. This has several important
+//    consequences.
+//    - First, it is constructed before it is first needed, possibly by another static object,
+//      and destroyed after that object is destroyed.
+//    - Second, it is constructed in a thread-safe manner.
+//    - Last, this order of construction/destruction is enforced across compilation units, as opposed
+//      to static objects that are simply declared in order in a single unit, whose lifespan is
+//      unconnected to that of statics in other compilation units. This is achieved automatically by the
+//      runtime, which uses atexit() to build a chain.
+// 2) This ORTTrainingPythonEnv is currently owned by a unique_ptr, unlike the Environment singleton. This is
+//    because we currently see no need for any Python object or other singleton to refer to it.
+//    With this change, this singleton is properly destroyed after the Python module is unloaded, but before the Environment.
+//    HOWEVER, because it holds instances of execution providers, we want to make sure that those instances are destroyed
+//    before the EP DLLs they depend on are unloaded, so that the EP destructors can run.
+//    This static is destroyed when this compilation unit is unloaded, which generally happens
+//    AFTER the EP DLLs are unloaded. To mitigate that, we clear the EP instances using the Python `atexit` mechanism
+//    (different from C atexit()), which takes place after all Python objects are GCed but before any DLLs are
+//    unloaded or the runtime starts destroying globals.
+// 3) We guard against singleton resurrection attempts in order to detect code that runs when it should not,
+//    so that the necessary adjustments can be made.
+//    For the related details and why this is needed, see "Modern C++ Design" by A. Alexandrescu, Chapter 6.
+class TrainingEnvInitialzer {
+ public:
+
+  static ORTTrainingPythonEnv& Instance() {
+    // Guard against attempts to resurrect the singleton
+    if (TrainingEnvInitialzer::destroyed) {
+      ORT_THROW("Detected an attempt to resurrect destroyed Training Environment");
    }
-    // Initialization of the module
+
+    static TrainingEnvInitialzer training_env_holder;
+
+    return training_env_holder.Get();
+  }
+
+ private:
+
+  TrainingEnvInitialzer() {
    InitArray();
    Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON);
-    ort_training_env = std::make_unique<ORTTrainingPythonEnv>();
-    initialized = true;
-  };
-  initialize();
-}
+    ort_training_env_ = std::make_unique<ORTTrainingPythonEnv>();
+  }
+
+  ~TrainingEnvInitialzer() {
+    destroyed = true;
+  }
+
+  ORTTrainingPythonEnv& Get() noexcept {
+    return *ort_training_env_;
+  }
+
+  std::unique_ptr<ORTTrainingPythonEnv> ort_training_env_;
+
+  static bool destroyed;
+};
+
+bool TrainingEnvInitialzer::destroyed = false;
+
+}  // namespace

 ORTTrainingPythonEnv& GetTrainingEnv() {
-  if (!ort_training_env) {
-    InitializeTrainingEnv();
-  }
-  return *ort_training_env;
-}
-
-Environment& GetTrainingORTEnv() {
-  if (!ort_training_env) {
-    InitializeTrainingEnv();
-  }
-  return ort_training_env->GetORTEnv();
+  return TrainingEnvInitialzer::Instance();
 }

+// TODO: If this global has a conflicting lifespan with other globals
+// such as Environment, follow the global objects management pattern for
+// Environment and ORTTrainingPythonEnv
 #ifdef ENABLE_EAGER_MODE
 using namespace torch_ort::eager;
 static std::unique_ptr<ORTBackendsManager> ort_backends_manager_instance;
@ -225,8 +265,9 @@ void InitializeBackendsManager() {
      return;
    }
    // Initialization of the module
-    auto& env = onnxruntime::python::GetTrainingORTEnv();
-    ort_backends_manager_instance = std::make_unique<ORTBackendsManager>(env.GetLoggingManager()->DefaultLogger());
+    auto& training_env = onnxruntime::python::GetTrainingEnv();
+    auto env = training_env.GetORTEnv();
+    ort_backends_manager_instance = std::make_unique<ORTBackendsManager>(env->GetLoggingManager()->DefaultLogger());
    initialized = true;
  };
  initialize();
@ -247,7 +288,7 @@ void ResolveExtraProviderOptions(const std::vector<std::string>& provider_types,
  for (auto& provider_type : provider_types) {
    auto it = training_env.ext_execution_provider_info_map_.find(provider_type);
    if (it == training_env.ext_execution_provider_info_map_.end()) {
-      //nothing changed.
+      // nothing changed.
      if (original_provider_options_map.find(provider_type) != original_provider_options_map.end())
        merged_options.insert({provider_type, original_provider_options_map.at(provider_type)});
    } else {
@ -318,9 +359,10 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
  m.doc() = "pybind11 stateful interface to ORTTraining";
  RegisterExceptions(m);

-  Environment& env = GetTrainingORTEnv();
-  addGlobalMethods(m, env);
-  addObjectMethods(m, env, ORTTrainingRegisterExecutionProviders);
+  // Instantiate singletons
+  GetTrainingEnv();
+  addGlobalMethods(m);
+  addObjectMethods(m, ORTTrainingRegisterExecutionProviders);
  addOrtValueMethods(m);
  addSparseTensorMethods(m);
  addIoBindingMethods(m);
@ -357,19 +399,20 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {

  m.def("get_version_string", []() -> std::string { return ORT_VERSION; });

-  m.def("clear_training_ep_instances", []() -> void {
-    ort_training_env->ClearExecutionProviderInstances();
-  },
-        "Clean the execution provider instances used in ort training module.");
+  m.def(
+      "clear_training_ep_instances", []() -> void {
+        GetTrainingEnv().ClearExecutionProviderInstances();
+      },
+      "Clean the execution provider instances used in ort training module.");

-  // clean the ort training environment when python interpreter exit
-  // otherwise the global var will be de-constrcut after user main.
-  // the order of ort training environment deconstruction and cudart
-  // deconstruction is not stable, which will lead to crash.
+  // See documentation for class TrainingEnvInitialzer earlier in this module
+  // for an explanation as to why this is needed.
  auto atexit = py::module_::import("atexit");
  atexit.attr("register")(py::cpp_function([]() {
-    ort_training_env = nullptr;
+    GetTrainingEnv().ClearExecutionProviderInstances();
 #ifdef ENABLE_EAGER_MODE
+    // This singleton should also be re-factored into a function local static
+    // so its lifetime is properly managed.
    ort_backends_manager_instance = nullptr;
 #endif
  }));