diff --git a/onnxruntime/core/platform/windows/debug_alloc.cc b/onnxruntime/core/platform/windows/debug_alloc.cc
index 4e993ace4e..495e54e0a3 100644
--- a/onnxruntime/core/platform/windows/debug_alloc.cc
+++ b/onnxruntime/core/platform/windows/debug_alloc.cc
@@ -224,7 +224,8 @@ Memory_LeakCheck::~Memory_LeakCheck() {
         string.find("re2::RE2::Init") == std::string::npos &&
         string.find("testing::internal::Mutex::ThreadSafeLazyInit") == std::string::npos &&
         string.find("testing::internal::ThreadLocalRegistryImpl::GetThreadLocalsMapLocked") == std::string::npos &&
-        string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos) {
+        string.find("testing::internal::ThreadLocalRegistryImpl::GetValueOnCurrentThread") == std::string::npos &&
+        string.find("PyInit_onnxruntime_pybind11_state") == std::string::npos){
       if (leaked_bytes == 0)
         DebugPrint("\n-----Starting Heap Trace-----\n\n");
 
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 7a2aad88a1..9f1e098fd7 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -612,7 +612,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
             ORT_THROW("Invalid value passed for enable_vpu_fast_compile: ", option.second);
           }
 
-        }  else if (option.first == "enable_opencl_throttling") {
+        } else if (option.first == "enable_opencl_throttling") {
           if (option.second == "True") {
             params.enable_opencl_throttling = true;
           } else if (option.second == "False") {
@@ -768,7 +768,8 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
 #ifdef USE_QNN
     auto cit = provider_options_map.find(type);
     return onnxruntime::QNNProviderFactoryCreator::Create(
-               cit == provider_options_map.end() ? ProviderOptions{} : cit->second)->CreateProvider();
+               cit == provider_options_map.end() ? ProviderOptions{} : cit->second)
+        ->CreateProvider();
 #endif
   } else {
     // check whether it is a dynamic load EP:
@@ -901,7 +902,7 @@ static void LogDeprecationWarning(
 }
 #endif
 
-void addGlobalMethods(py::module& m, Environment& env) {
+void addGlobalMethods(py::module& m) {
   m.def("get_default_session_options", &GetDefaultCPUSessionOptions, "Return a default session_options instance.");
   m.def("get_session_initializer", &SessionObjectInitializer::Get, "Return a default session object initializer.");
   m.def(
@@ -911,16 +912,18 @@ void addGlobalMethods(py::module& m, Environment& env) {
       "set_seed", [](const int64_t seed) { utils::SetRandomSeed(seed); },
       "Sets the seed used for random number generation in Onnxruntime.");
   m.def(
-      "set_default_logger_severity", [&env](int severity) {
+      "set_default_logger_severity", [](int severity) {
         ORT_ENFORCE(severity >= 0 && severity <= 4,
                     "Invalid logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal");
-        logging::LoggingManager* default_logging_manager = env.GetLoggingManager();
+        auto env = GetEnv();
+        logging::LoggingManager* default_logging_manager = env->GetLoggingManager();
         default_logging_manager->SetDefaultLoggerSeverity(static_cast<logging::Severity>(severity));
       },
       "Sets the default logging severity. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal");
   m.def(
-      "set_default_logger_verbosity", [&env](int vlog_level) {
-        logging::LoggingManager* default_logging_manager = env.GetLoggingManager();
+      "set_default_logger_verbosity", [](int vlog_level) {
+        auto env = GetEnv();
+        logging::LoggingManager* default_logging_manager = env->GetLoggingManager();
         default_logging_manager->SetDefaultLoggerVerbosity(vlog_level);
       },
       "Sets the default logging verbosity level. To activate the verbose log, "
@@ -937,8 +940,9 @@ void addGlobalMethods(py::module& m, Environment& env) {
       "disable_telemetry_events", []() -> void { platform_env.GetTelemetryProvider().DisableTelemetryEvents(); },
       "Disables platform-specific telemetry collection.");
   m.def(
-      "create_and_register_allocator", [&env](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void {
-        auto st = env.CreateAndRegisterAllocator(mem_info, arena_cfg);
+      "create_and_register_allocator", [](const OrtMemoryInfo& mem_info, const OrtArenaCfg* arena_cfg = nullptr) -> void {
+        auto env = GetEnv();
+        auto st = env->CreateAndRegisterAllocator(mem_info, arena_cfg);
         if (!st.IsOK()) {
           throw std::runtime_error("Error when creating and registering allocator: " + st.ErrorMessage());
         }
@@ -1034,7 +1038,7 @@ void addGlobalMethods(py::module& m, Environment& env) {
 #endif
 }
 
-void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn) {
+void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn) {
   py::enum_<GraphOptimizationLevel>(m, "GraphOptimizationLevel")
       .value("ORT_DISABLE_ALL", GraphOptimizationLevel::ORT_DISABLE_ALL)
       .value("ORT_ENABLE_BASIC", GraphOptimizationLevel::ORT_ENABLE_BASIC)
@@ -1077,33 +1081,33 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra
   // See docs/C_API.md for details on what the following parameters mean and how to choose these values
   ort_arena_cfg_binding.def(py::init([](size_t max_mem, int arena_extend_strategy_local,
                                         int initial_chunk_size_bytes, int max_dead_bytes_per_chunk) {
-    auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
-    ort_arena_cfg->max_mem = max_mem;
-    ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local;
-    ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
-    ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
-    return ort_arena_cfg;
-  }))
+                         auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
+                         ort_arena_cfg->max_mem = max_mem;
+                         ort_arena_cfg->arena_extend_strategy = arena_extend_strategy_local;
+                         ort_arena_cfg->initial_chunk_size_bytes = initial_chunk_size_bytes;
+                         ort_arena_cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk;
+                         return ort_arena_cfg;
+                       }))
       .def(py::init([](const py::dict& feeds) {
-    auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
-    for (const auto kvp : feeds) {
-      std::string key = kvp.first.cast<std::string>();
-      if (key == "max_mem") {
-        ort_arena_cfg->max_mem = kvp.second.cast<size_t>();
-      } else if (key == "arena_extend_strategy") {
-        ort_arena_cfg->arena_extend_strategy = kvp.second.cast<int>();
-      } else if (key == "initial_chunk_size_bytes") {
-        ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast<int>();
-      } else if (key == "max_dead_bytes_per_chunk") {
-        ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
-      } else if (key == "initial_growth_chunk_size_bytes") {
-        ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
-        } else {
-        ORT_THROW("Invalid OrtArenaCfg option: ", key);
-      }
-    }
-    return ort_arena_cfg;
-  }))
+        auto ort_arena_cfg = std::make_unique<OrtArenaCfg>();
+        for (const auto kvp : feeds) {
+          std::string key = kvp.first.cast<std::string>();
+          if (key == "max_mem") {
+            ort_arena_cfg->max_mem = kvp.second.cast<size_t>();
+          } else if (key == "arena_extend_strategy") {
+            ort_arena_cfg->arena_extend_strategy = kvp.second.cast<int>();
+          } else if (key == "initial_chunk_size_bytes") {
+            ort_arena_cfg->initial_chunk_size_bytes = kvp.second.cast<int>();
+          } else if (key == "max_dead_bytes_per_chunk") {
+            ort_arena_cfg->max_dead_bytes_per_chunk = kvp.second.cast<int>();
+          } else if (key == "initial_growth_chunk_size_bytes") {
+            ort_arena_cfg->initial_growth_chunk_size_bytes = kvp.second.cast<int>();
+          } else {
+            ORT_THROW("Invalid OrtArenaCfg option: ", key);
+          }
+        }
+        return ort_arena_cfg;
+      }))
       .def_readwrite("max_mem", &OrtArenaCfg::max_mem)
       .def_readwrite("arena_extend_strategy", &OrtArenaCfg::arena_extend_strategy)
       .def_readwrite("initial_chunk_size_bytes", &OrtArenaCfg::initial_chunk_size_bytes)
@@ -1135,7 +1139,7 @@ void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistra
           "enable_cpu_mem_arena",
           [](const PySessionOptions* options) -> bool { return options->value.enable_cpu_mem_arena; },
           [](PySessionOptions* options, bool enable_cpu_mem_arena) -> void {
-              options->value.enable_cpu_mem_arena = enable_cpu_mem_arena;
+            options->value.enable_cpu_mem_arena = enable_cpu_mem_arena;
           },
           R"pbdoc(Enables the memory arena on CPU. Arena may pre-allocate memory for future usage.
 Set this option to false if you don't want it. Default is True.)pbdoc")
@@ -1143,13 +1147,13 @@ Set this option to false if you don't want it. Default is True.)pbdoc")
           "enable_profiling",
           [](const PySessionOptions* options) -> bool { return options->value.enable_profiling; },
           [](PySessionOptions* options, bool enable_profiling) -> void {
-              options->value.enable_profiling = enable_profiling;
+            options->value.enable_profiling = enable_profiling;
           },
           R"pbdoc(Enable profiling for this session. Default is false.)pbdoc")
       .def_property(
           "profile_file_prefix",
           [](const PySessionOptions* options) -> std::basic_string<ORTCHAR_T> {
-              return options->value.profile_file_prefix;
+            return options->value.profile_file_prefix;
           },
           [](PySessionOptions* options, std::basic_string<ORTCHAR_T> profile_file_prefix) -> void {
             options->value.profile_file_prefix = std::move(profile_file_prefix);
@@ -1175,14 +1179,14 @@ Serialized model format will default to ONNX unless:
           "enable_mem_pattern",
           [](const PySessionOptions* options) -> bool { return options->value.enable_mem_pattern; },
           [](PySessionOptions* options, bool enable_mem_pattern) -> void {
-              options->value.enable_mem_pattern = enable_mem_pattern;
+            options->value.enable_mem_pattern = enable_mem_pattern;
           },
           R"pbdoc(Enable the memory pattern optimization. Default is true.)pbdoc")
       .def_property(
           "enable_mem_reuse",
           [](const PySessionOptions* options) -> bool { return options->value.enable_mem_reuse; },
           [](PySessionOptions* options, bool enable_mem_reuse) -> void {
-              options->value.enable_mem_reuse = enable_mem_reuse;
+            options->value.enable_mem_reuse = enable_mem_reuse;
           },
           R"pbdoc(Enable the memory reuse optimization. Default is true.)pbdoc")
       .def_property(
@@ -1198,7 +1202,7 @@ Serialized model format will default to ONNX unless:
           "log_severity_level",
           [](const PySessionOptions* options) -> int { return options->value.session_log_severity_level; },
           [](PySessionOptions* options, int log_severity_level) -> void {
-              options->value.session_log_severity_level = log_severity_level;
+            options->value.session_log_severity_level = log_severity_level;
           },
           R"pbdoc(Log severity level. Applies to session load, initialization, etc.
 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2.)pbdoc")
@@ -1224,7 +1228,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
           "execution_mode",
           [](const PySessionOptions* options) -> ExecutionMode { return options->value.execution_mode; },
           [](PySessionOptions* options, ExecutionMode execution_mode) -> void {
-              options->value.execution_mode = execution_mode;
+            options->value.execution_mode = execution_mode;
           },
           R"pbdoc(Sets the execution mode. Default is sequential.)pbdoc")
       .def_property(
@@ -1280,7 +1284,7 @@ Applies to session load, initialization, etc. Default is 0.)pbdoc")
           "use_deterministic_compute",
           [](const PySessionOptions* options) -> bool { return options->value.use_deterministic_compute; },
           [](PySessionOptions* options, bool use_deterministic_compute) -> void {
-              options->value.use_deterministic_compute = use_deterministic_compute;
+            options->value.use_deterministic_compute = use_deterministic_compute;
           },
           R"pbdoc(Whether to use deterministic compute. Default is false.)pbdoc")
       .def(
@@ -1477,15 +1481,16 @@ including arg name, arg type (contains both type and shape).)pbdoc")
   py::class_<PyInferenceSession>(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc")
       // In Python3, a Python bytes object will be passed to C++ functions that accept std::string or char*
       // without any conversion. So this init method can be used for model file path (string) and model content (bytes)
-      .def(py::init([&env](const PySessionOptions& so, const std::string arg, bool is_arg_file_name,
-                           bool load_config_from_model = false) {
+      .def(py::init([](const PySessionOptions& so, const std::string arg, bool is_arg_file_name,
+                       bool load_config_from_model = false) {
+        auto env = GetEnv();
         std::unique_ptr<PyInferenceSession> sess;
 
         // separate creation of the session from model loading unless we have to read the config from the model.
         // in a minimal build we only support load via Load(...) and not at session creation time
         if (load_config_from_model) {
 #if !defined(ORT_MINIMAL_BUILD)
-          sess = std::make_unique<PyInferenceSession>(env, so, arg, is_arg_file_name);
+          sess = std::make_unique<PyInferenceSession>(std::move(env), so, arg, is_arg_file_name);
 
           RegisterCustomOpDomains(sess.get(), so);
 
@@ -1494,7 +1499,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
           ORT_THROW("Loading configuration from an ONNX model is not supported in this build.");
 #endif
         } else {
-          sess = std::make_unique<PyInferenceSession>(env, so);
+          sess = std::make_unique<PyInferenceSession>(std::move(env), so);
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
           RegisterCustomOpDomains(sess.get(), so);
 #endif
@@ -1600,21 +1605,10 @@ including arg name, arg type (contains both type and shape).)pbdoc")
         }
         return fetches;
       })
-      .def("run_with_ortvaluevector", [](
-        PyInferenceSession* sess,
-        RunOptions run_options,
-        const std::vector<std::string>& feed_names,
-        const std::vector<OrtValue>& feeds,
-        const std::vector<std::string>& fetch_names,
-        std::vector<OrtValue>& fetches,
-        const std::vector<OrtDevice>& fetch_devices) -> void {
-
-        {
-          // release GIL to allow multiple python threads to invoke Run() in parallel.
-          py::gil_scoped_release release;
-          OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices));
-        }
-
+      .def("run_with_ortvaluevector", [](PyInferenceSession* sess, RunOptions run_options, const std::vector<std::string>& feed_names, const std::vector<OrtValue>& feeds, const std::vector<std::string>& fetch_names, std::vector<OrtValue>& fetches, const std::vector<OrtDevice>& fetch_devices) -> void {
+        // release GIL to allow multiple python threads to invoke Run() in parallel.
+        py::gil_scoped_release release;
+        OrtPybindThrowIfError(sess->GetSessionHandle()->Run(run_options, feed_names, feeds, fetch_names, &fetches, &fetch_devices));
       })
       .def("end_profiling", [](const PyInferenceSession* sess) -> std::string {
         return sess->GetSessionHandle()->EndProfiling();
@@ -1691,27 +1685,27 @@ including arg name, arg type (contains both type and shape).)pbdoc")
 
         return ret;
 #else
-        ORT_UNUSED_PARAMETER(sess);
-        ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
+            ORT_UNUSED_PARAMETER(sess);
+            ORT_THROW("TunableOp and get_tuning_results are not supported in this build.");
 #endif
       })
       .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void {
 #if !defined(ORT_MINIMAL_BUILD)
         std::vector<TuningResults> tuning_results;
-        for (auto handle: results) {
+        for (auto handle : results) {
           auto py_trs = handle.cast<py::dict>();
           TuningResults trs;
           trs.ep = py_trs["ep"].cast<py::str>();
 
-          for (const auto [py_op_sig, py_kernel_map]: py_trs["results"].cast<py::dict>()) {
+          for (const auto [py_op_sig, py_kernel_map] : py_trs["results"].cast<py::dict>()) {
             KernelMap kernel_map;
-            for (const auto [py_params_sig, py_kernel_id]: py_kernel_map.cast<py::dict>()) {
+            for (const auto [py_params_sig, py_kernel_id] : py_kernel_map.cast<py::dict>()) {
               kernel_map[py_params_sig.cast<py::str>()] = py_kernel_id.cast<py::int_>();
             }
             trs.results[py_op_sig.cast<py::str>()] = kernel_map;
           }
 
-          for (const auto [k, v]: py_trs["validators"].cast<py::dict>()) {
+          for (const auto [k, v] : py_trs["validators"].cast<py::dict>()) {
             trs.validators[k.cast<py::str>()] = v.cast<py::str>();
           }
 
@@ -1723,10 +1717,10 @@ including arg name, arg type (contains both type and shape).)pbdoc")
           throw std::runtime_error("Error in execution: " + status.ErrorMessage());
         }
 #else
-        ORT_UNUSED_PARAMETER(sess);
-        ORT_UNUSED_PARAMETER(results);
-        ORT_UNUSED_PARAMETER(error_on_invalid);
-        ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
+            ORT_UNUSED_PARAMETER(sess);
+            ORT_UNUSED_PARAMETER(results);
+            ORT_UNUSED_PARAMETER(error_on_invalid);
+            ORT_THROW("TunableOp and set_tuning_results are not supported in this build.");
 #endif
       });
 
@@ -1746,10 +1740,10 @@ void CreateInferencePybindStateModule(py::module& m) {
     import_array1();
   })();
 
-  Environment& env = GetEnv();
+  auto env = GetEnv();
 
-  addGlobalMethods(m, env);
-  addObjectMethods(m, env, RegisterExecutionProviders);
+  addGlobalMethods(m);
+  addObjectMethods(m, RegisterExecutionProviders);
   addOrtValueMethods(m);
   addSparseTensorMethods(m);
   addIoBindingMethods(m);
@@ -1775,34 +1769,77 @@ void InitArray() {
   })();
 }
 
-// static variable used to create inference session and training session.
-static std::unique_ptr<Environment> session_env;
+namespace {
+// This class provides a static shell for on-demand and thread-safe construction
+// of Environment object for both Inference and Training python layers.
+// Environment class contains objects such as default logger, that must be available
+// for the entire duration of a program that makes use of onnxruntime library.
+// Because Python is a garbage-collected language and the order of destruction of objects
+// is not guaranteed, we design this class with the following important features.
 
-void InitializeEnv() {
-  auto initialize = [&]() {
+// 1) We make this class a singleton that is a function-local static. Function-local statics
+//    are constructed when the function is called for the very first time. This fact has several
+//    important consequences.
+//    - First, it is constructed before it is first needed, possibly by another static object,
+//      and destroyed after that object is destroyed.
+//    - Second, it is constructed in a thread-safe manner.
+//    - Last, this order of construction/destruction is enforced across compilation units, as opposed
+//      to static objects that are simply declared in order in a single unit, whose lifespan is
+//      unconnected to that of statics in other compilation units. This is achieved automatically by
+//      the runtime executing atexit() to build a chain.
+// 2) We make Environment owned by a shared_ptr. This is done because Python objects such as Inference and Training
+//    sessions depend on this global. We acquire a shared_ptr instance when those objects are instantiated
+//    and release it automatically when they are garbage collected. Although with this change all of the
+//    globals seem to be destroyed after the module is unloaded, and GC runs before that, it is cheap and gives
+//    peace of mind, as there were situations in the past when GC was still running after Env was gone.
+//    The TrainingEnv global also holds a shared reference to this global.
+// 3) We guard against singleton resurrection attempts to detect code that runs when it should
+//    not, so that the necessary adjustments can be made.
+//    For all the related details and why it is needed, see "Modern C++ Design" by A. Alexandrescu, Chapter 6.
+class EnvInitializer {
+ public:
+  static std::shared_ptr<onnxruntime::Environment> SharedInstance() {
+    // Guard against attempts to resurrect the singleton
+    if (EnvInitializer::destroyed) {
+      ORT_THROW("Detected an attempt to resurrect destroyed Environment");
+    }
+    static EnvInitializer env_holder;
+    return env_holder.Get();
+  }
+
+ private:
+  EnvInitializer() {
     // Initialization of the module
     InitArray();
+    std::unique_ptr<Environment> env_ptr;
     Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON);
     OrtPybindThrowIfError(Environment::Create(std::make_unique<LoggingManager>(
                                                   std::make_unique<CLogSink>(),
                                                   Severity::kWARNING, false, LoggingManager::InstanceType::Default,
                                                   &SessionObjectInitializer::default_logger_id),
-                                              session_env));
-
-    static bool initialized = false;
-    if (initialized) {
-      return;
-    }
-    initialized = true;
-  };
-  initialize();
-}
-
-onnxruntime::Environment& GetEnv() {
-  if (!session_env) {
-    InitializeEnv();
+                                              env_ptr));
+    session_env_ = std::shared_ptr<Environment>(env_ptr.release());
+    destroyed = false;
   }
-  return *session_env;
+
+  ~EnvInitializer() {
+    destroyed = true;
+  }
+
+  std::shared_ptr<Environment> Get() const {
+    return session_env_;
+  }
+
+  std::shared_ptr<Environment> session_env_;
+
+  static bool destroyed;
+};
+
+bool EnvInitializer::destroyed = false;
+}  // namespace
+
+std::shared_ptr<onnxruntime::Environment> GetEnv() {
+  return EnvInitializer::SharedInstance();
 }
 
 }  // namespace python
diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h
index add6c0afd1..6943501e0b 100644
--- a/onnxruntime/python/onnxruntime_pybind_state_common.h
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.h
@@ -232,33 +232,36 @@ using PySessionOptions = OrtSessionOptions;
 
 // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user
 struct PyInferenceSession {
-  PyInferenceSession(Environment& env, const PySessionOptions& so) {
-    sess_ = std::make_unique<InferenceSession>(so.value, env);
+  PyInferenceSession(std::shared_ptr<Environment> env, const PySessionOptions& so)
+  : env_(std::move(env)) {
+    sess_ = std::make_unique<InferenceSession>(so.value, *env_);
   }
 
 #if !defined(ORT_MINIMAL_BUILD)
-  PyInferenceSession(Environment& env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) {
+  PyInferenceSession(std::shared_ptr<Environment> env, const PySessionOptions& so, const std::string& arg, bool is_arg_file_name) 
+  : env_(std::move(env)) {
     if (is_arg_file_name) {
       // Given arg is the file path. Invoke the corresponding ctor().
-      sess_ = std::make_unique<InferenceSession>(so.value, env, arg);
+      sess_ = std::make_unique<InferenceSession>(so.value, *env_, arg);
     } else {
       // Given arg is the model content as bytes. Invoke the corresponding ctor().
       std::istringstream buffer(arg);
-      sess_ = std::make_unique<InferenceSession>(so.value, env, buffer);
+      sess_ = std::make_unique<InferenceSession>(so.value, *env_, buffer);
     }
   }
 #endif
 
   InferenceSession* GetSessionHandle() const { return sess_.get(); }
 
-  virtual ~PyInferenceSession() {}
+  virtual ~PyInferenceSession() = default;
 
  protected:
-  PyInferenceSession(std::unique_ptr<InferenceSession> sess) {
-    sess_ = std::move(sess);
+  PyInferenceSession(std::shared_ptr<Environment> env, std::unique_ptr<InferenceSession> sess) 
+  : env_(std::move(env)), sess_(std::move(sess)) {
   }
 
  private:
+  std::shared_ptr<Environment> env_;
   std::unique_ptr<InferenceSession> sess_;
 };
 
@@ -383,7 +386,7 @@ class SessionObjectInitializer {
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
 #endif
-Environment& GetEnv();
+std::shared_ptr<Environment> GetEnv();
 
 // Initialize an InferenceSession.
 // Any provider_options should have entries in matching order to provider_types.
diff --git a/orttraining/orttraining/lazy_tensor/accelerator.cc b/orttraining/orttraining/lazy_tensor/accelerator.cc
index 05102b74c1..d1b407a5af 100644
--- a/orttraining/orttraining/lazy_tensor/accelerator.cc
+++ b/orttraining/orttraining/lazy_tensor/accelerator.cc
@@ -26,10 +26,6 @@
 
 namespace onnxruntime {
 
-namespace python {
-Environment& GetTrainingORTEnv();
-}
-
 namespace lazytensor {
 
 namespace py = pybind11;
@@ -304,11 +300,11 @@ static std::unique_ptr<onnxruntime::InferenceSession> CreateSession() {
 #ifdef USE_CUDA
   NvtxRange range(__func__);
 #endif
-  // Environment shared by all sessions.
-  static onnxruntime::Environment& pybind_default_env = onnxruntime::python::GetTrainingORTEnv();
   // All sessions use the same config.
   static onnxruntime::SessionOptions sess_opts;
-  return std::make_unique<onnxruntime::InferenceSession>(sess_opts, pybind_default_env);
+  // Query the singleton always, to make sure we detect shutdown
+  auto ort_env = onnxruntime::python::GetEnv();
+  return std::make_unique<onnxruntime::InferenceSession>(sess_opts, *ort_env);
 }
 
 static OrtDevice CheckAndGetTensorDevice(const at::ArrayRef<c10::IValue>& values) {
diff --git a/orttraining/orttraining/python/orttraining_pybind_common.h b/orttraining/orttraining/python/orttraining_pybind_common.h
index c3a5422c22..46fe3efd41 100644
--- a/orttraining/orttraining/python/orttraining_pybind_common.h
+++ b/orttraining/orttraining/python/orttraining_pybind_common.h
@@ -22,7 +22,7 @@ class ORTTrainingPythonEnv {
  public:
   ORTTrainingPythonEnv();
 
-  Environment& GetORTEnv();
+  std::shared_ptr<Environment> GetORTEnv() const;
 
   std::shared_ptr<IExecutionProvider> GetExecutionProviderInstance(const std::string& provider_type,
                                                                    size_t hash);
@@ -45,7 +45,7 @@ class ORTTrainingPythonEnv {
   std::string GetExecutionProviderMapKey(const std::string& provider_type,
                                          size_t hash);
 
-  std::unique_ptr<Environment> ort_env_;
+  std::shared_ptr<Environment> ort_env_;
   ExecutionProviderMap execution_provider_instances_map_;
   std::vector<std::string> available_training_eps_;
 };
diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc
index 14a407dc0c..71c0fe5654 100644
--- a/orttraining/orttraining/python/orttraining_pybind_state.cc
+++ b/orttraining/orttraining/python/orttraining_pybind_state.cc
@@ -49,7 +49,6 @@ using namespace onnxruntime;
 using namespace onnxruntime::logging;
 using namespace onnxruntime::training;
 
-Environment& GetTrainingORTEnv();
 ORTTrainingPythonEnv& GetTrainingEnv();
 
 void ResolveExtraProviderOptions(const std::vector<std::string>& provider_types,
@@ -169,9 +168,13 @@ struct TrainingConfigurationResult {
 struct PyOptimizer {
   PyOptimizer(const std::string optimizer_model_uri,
               onnxruntime::training::api::Module* model, std::vector<std::shared_ptr<IExecutionProvider>> provider)
-      : optimizer_(std::make_unique<onnxruntime::training::api::Optimizer>(optimizer_model_uri,
-                                                                           model->NamedParameters(), onnxruntime::SessionOptions(),
-                                                                           GetTrainingORTEnv(), provider)) {
+      : optimizer_() {
+
+    auto env = GetTrainingEnv().GetORTEnv();
+    // XXX: Only a reference to *env is passed below; we hope that env will be around when optimizer needs it.
+    optimizer_ = std::make_shared<onnxruntime::training::api::Optimizer>(optimizer_model_uri,
+                                                                         model->NamedParameters(), onnxruntime::SessionOptions(),
+                                                                         *env, provider);
   }
 
   std::shared_ptr<onnxruntime::training::api::Optimizer> optimizer_;
@@ -549,20 +552,21 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn
 
   // Thin wrapper over internal C++ InferenceSession to accommodate custom op library management for the Python user
   struct PyTrainingSession : public PyInferenceSession {
-    PyTrainingSession(Environment& env, const PySessionOptions& so)
-        : PyInferenceSession(std::make_unique<PipelineTrainingSession>(so.value, env)) {
+    PyTrainingSession(std::shared_ptr<Environment> env, const PySessionOptions& so)
+        : PyInferenceSession(env, std::make_unique<PipelineTrainingSession>(so.value, *env)) {
     }
+    ~PyTrainingSession() = default;
   };
 
   py::class_<PyTrainingSession, PyInferenceSession> training_session(m, "TrainingSession");
   training_session
       .def(py::init([](const PySessionOptions& so) {
-        Environment& env = GetTrainingORTEnv();
-        return std::make_unique<PyTrainingSession>(env, so);
+        auto& training_env = GetTrainingEnv();
+        return std::make_unique<PyTrainingSession>(training_env.GetORTEnv(), so);
       }))
       .def(py::init([]() {
-        Environment& env = GetTrainingORTEnv();
-        return std::make_unique<PyTrainingSession>(env, GetDefaultCPUSessionOptions());
+        auto& training_env = GetTrainingEnv();
+        return std::make_unique<PyTrainingSession>(training_env.GetORTEnv(), GetDefaultCPUSessionOptions());
       }))
       .def("finalize", [](py::object) {
 #if defined(USE_MPI)
@@ -876,10 +880,11 @@ void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn
         onnxruntime::SessionOptions session_option;
         std::vector<std::shared_ptr<IExecutionProvider>> provider = GetExecutionProvidersForTrainingApis(device);
 
+        auto env = GetTrainingEnv().GetORTEnv();
         return std::make_unique<onnxruntime::training::api::Module>(
             model_uri,
             state.module_checkpoint_state.named_parameters, session_option,
-            GetTrainingORTEnv(), provider, eval_model_uri);
+            *env, provider, eval_model_uri);
       }))
       .def("train_step",
            [](onnxruntime::training::api::Module* model,
diff --git a/orttraining/orttraining/python/orttraining_python_module.cc b/orttraining/orttraining/python/orttraining_python_module.cc
index f2dbb27072..40caa07684 100644
--- a/orttraining/orttraining/python/orttraining_python_module.cc
+++ b/orttraining/orttraining/python/orttraining_python_module.cc
@@ -36,8 +36,8 @@ const ROCMExecutionProviderInfo GetRocmExecutionProviderInfo(ProviderInfo_ROCM*
                                                              const ProviderOptionsMap& provider_options_map);
 #endif
 
-void addGlobalMethods(py::module& m, Environment& env);
-void addObjectMethods(py::module& m, Environment& env, ExecutionProviderRegistrationFn ep_registration_fn);
+void addGlobalMethods(py::module& m);
+void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn);
 void addObjectMethodsForTraining(py::module& m, ExecutionProviderRegistrationFn ep_registration_fn);
 void addObjectMethodsForEager(py::module& m);
 #ifdef ENABLE_LAZY_TENSOR
@@ -133,18 +133,13 @@ bool GetProviderInstanceHash(const std::string& type,
   return false;
 }
 
-ORTTrainingPythonEnv::ORTTrainingPythonEnv() {
-  OrtPybindThrowIfError(Environment::Create(std::make_unique<LoggingManager>(
-                                                std::make_unique<CLogSink>(),
-                                                Severity::kWARNING, false, LoggingManager::InstanceType::Default,
-                                                &SessionObjectInitializer::default_logger_id),
-                                            ort_env_));
-  auto& builtinEPs = GetAvailableExecutionProviderNames();
+ORTTrainingPythonEnv::ORTTrainingPythonEnv() : ort_env_(GetEnv()) {
+  const auto& builtinEPs = GetAvailableExecutionProviderNames();
   available_training_eps_.assign(builtinEPs.begin(), builtinEPs.end());
 }
 
-Environment& ORTTrainingPythonEnv::GetORTEnv() {
-  return *ort_env_;
+std::shared_ptr<Environment> ORTTrainingPythonEnv::GetORTEnv() const {
+  return ort_env_;
 }
 
 std::shared_ptr<IExecutionProvider> ORTTrainingPythonEnv::GetExecutionProviderInstance(const std::string& provider_type,
@@ -183,37 +178,82 @@ void ORTTrainingPythonEnv::ClearExecutionProviderInstances() {
   execution_provider_instances_map_.clear();
 }
 
-static std::unique_ptr<ORTTrainingPythonEnv> ort_training_env;
+namespace {
 
-void InitializeTrainingEnv() {
-  auto initialize = [&]() {
-    static bool initialized = false;
-    if (initialized) {
-      return;
+// This class provides a static shell for on-demand and thread-safe construction
+// of the ORTTrainingPythonEnv object for both the Inference and Training Python layers.
+// The ORTTrainingPythonEnv class contains instances of execution providers that have been
+// instantiated for training purposes. It depends on the Environment singleton, to which it
+// holds a shared_ptr.
+//
+// 1) We make this class a singleton that is a function-local static. Function-local statics
+//    are constructed when the function is called for the very first time. This has several important
+//    consequences.
+//    - First, it is constructed before it is first needed, possibly by another static object,
+//      and destroyed after that object is destroyed.
+//    - Second, it is constructed in a thread-safe manner.
+//    - Last, this order of construction/destruction is enforced across compilation units, as opposed
+//      to static objects that are simply declared in order within a single unit, whose lifespan is
+//      unrelated to that of objects in other compilation units. This is achieved automatically by the
+//      runtime, which executes atexit() to build a chain.
+// 2) This ORTTrainingPythonEnv is currently owned by a unique_ptr, unlike the Environment singleton. This is
+//    because we currently do not see a need for any Python objects or other singletons to refer to it.
+//    With this change the singleton is properly destroyed after the Python module is unloaded, but before the Environment.
+//    HOWEVER, because it holds instances of execution providers, we want to make sure that those instances are destroyed
+//    before the EP DLLs they depend on are unloaded, so that the EP destructors can run.
+//    This static is destroyed when this compilation unit is unloaded, which generally happens
+//    AFTER the EP DLLs are unloaded. To mitigate that, we clear the EP instances using the Python `atexit` mechanism
+//    (different from C atexit()), which runs after all Python objects are GCed but before any DLLs are unloaded or
+//    the runtime starts destroying globals.
+// 3) We guard against singleton resurrection attempts in order to detect code that runs when it should not,
+//    so that the necessary adjustments can be made.
+//    For the related details and why this is needed, see "Modern C++ Design" by A. Alexandrescu, Chapter 6.
+class TrainingEnvInitialzer {
+ public:
+
+  static ORTTrainingPythonEnv& Instance() {
+    // Guard against attempts to resurrect the singleton
+    if (TrainingEnvInitialzer::destroyed) {
+      ORT_THROW("Detected an attempt to resurrect destroyed Training Environment");
     }
-    // Initialization of the module
+
+    static TrainingEnvInitialzer training_env_holder;
+
+    return training_env_holder.Get();
+  }
+
+ private:
+
+  TrainingEnvInitialzer() {
     InitArray();
     Env::Default().GetTelemetryProvider().SetLanguageProjection(OrtLanguageProjection::ORT_PROJECTION_PYTHON);
-    ort_training_env = std::make_unique<ORTTrainingPythonEnv>();
-    initialized = true;
-  };
-  initialize();
-}
+    ort_training_env_ = std::make_unique<ORTTrainingPythonEnv>();
+  }
+
+  ~TrainingEnvInitialzer() {
+    destroyed = true;
+  }
+
+  ORTTrainingPythonEnv& Get() noexcept {
+    return *ort_training_env_;
+  }
+
+  std::unique_ptr<ORTTrainingPythonEnv> ort_training_env_;
+
+  static bool destroyed;
+};
+
+bool TrainingEnvInitialzer::destroyed = false;
+
+}  // namespace
 
 ORTTrainingPythonEnv& GetTrainingEnv() {
-  if (!ort_training_env) {
-    InitializeTrainingEnv();
-  }
-  return *ort_training_env;
-}
-
-Environment& GetTrainingORTEnv() {
-  if (!ort_training_env) {
-    InitializeTrainingEnv();
-  }
-  return ort_training_env->GetORTEnv();
+  return TrainingEnvInitialzer::Instance();
 }
 
+// TODO: If this global has a conflicting lifespan with other globals
+// such as Environment, follow the global objects management pattern for
+// Environment and ORTTrainingPythonEnv
 #ifdef ENABLE_EAGER_MODE
 using namespace torch_ort::eager;
 static std::unique_ptr<ORTBackendsManager> ort_backends_manager_instance;
@@ -225,8 +265,9 @@ void InitializeBackendsManager() {
       return;
     }
     // Initialization of the module
-    auto& env = onnxruntime::python::GetTrainingORTEnv();
-    ort_backends_manager_instance = std::make_unique<ORTBackendsManager>(env.GetLoggingManager()->DefaultLogger());
+    auto& training_env = onnxruntime::python::GetTrainingEnv();
+    auto env = training_env.GetORTEnv();
+    ort_backends_manager_instance = std::make_unique<ORTBackendsManager>(env->GetLoggingManager()->DefaultLogger());
     initialized = true;
   };
   initialize();
@@ -247,7 +288,7 @@ void ResolveExtraProviderOptions(const std::vector<std::string>& provider_types,
   for (auto& provider_type : provider_types) {
     auto it = training_env.ext_execution_provider_info_map_.find(provider_type);
     if (it == training_env.ext_execution_provider_info_map_.end()) {
-      //nothing changed.
+      // nothing changed.
       if (original_provider_options_map.find(provider_type) != original_provider_options_map.end())
         merged_options.insert({provider_type, original_provider_options_map.at(provider_type)});
     } else {
@@ -318,9 +359,10 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
   m.doc() = "pybind11 stateful interface to ORTTraining";
   RegisterExceptions(m);
 
-  Environment& env = GetTrainingORTEnv();
-  addGlobalMethods(m, env);
-  addObjectMethods(m, env, ORTTrainingRegisterExecutionProviders);
+  // Instantiate singletons
+  GetTrainingEnv();
+  addGlobalMethods(m);
+  addObjectMethods(m, ORTTrainingRegisterExecutionProviders);
   addOrtValueMethods(m);
   addSparseTensorMethods(m);
   addIoBindingMethods(m);
@@ -357,19 +399,20 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
 
   m.def("get_version_string", []() -> std::string { return ORT_VERSION; });
 
-  m.def("clear_training_ep_instances", []() -> void {
-    ort_training_env->ClearExecutionProviderInstances();
-  },
-        "Clean the execution provider instances used in ort training module.");
+  m.def(
+      "clear_training_ep_instances", []() -> void {
+        GetTrainingEnv().ClearExecutionProviderInstances();
+      },
+      "Clean the execution provider instances used in ort training module.");
 
-  // clean the ort training environment when python interpreter exit
-  // otherwise the global var will be de-constrcut after user main.
-  // the order of ort training environment deconstruction and cudart
-  // deconstruction is not stable, which will lead to crash.
+  // See documentation for class TrainingEnvInitialzer earlier in this module
+  // for an explanation as to why this is needed.
   auto atexit = py::module_::import("atexit");
   atexit.attr("register")(py::cpp_function([]() {
-    ort_training_env = nullptr;
+    GetTrainingEnv().ClearExecutionProviderInstances();
 #ifdef ENABLE_EAGER_MODE
+    // This singleton should also be re-factored into a function local static
+    // so its lifetime is properly managed.
     ort_backends_manager_instance = nullptr;
 #endif
   }));