From e89dd923875f8ad9ec66ea43db19cd637e0ccd1f Mon Sep 17 00:00:00 2001
From: Jeff <jeffbloo@outlook.com>
Date: Tue, 14 Apr 2020 21:34:49 -0700
Subject: [PATCH] Flush and trim resources in DML EP in new
 OnSessionInitializationEnd method

---
 .../core/framework/execution_provider.h       |  7 ++++
 .../core/framework/execution_provider.cc      |  2 +
 .../inc/DmlExecutionProvider.h                |  2 -
 .../src/ExecutionProvider.cpp                 | 42 +++++++------------
 .../src/ExecutionProvider.h                   | 29 +++++--------
 onnxruntime/core/session/inference_session.cc |  8 ++++
 winml/adapter/winml_adapter_apis.h            |  1 -
 winml/adapter/winml_adapter_c_api.cpp         |  1 -
 winml/adapter/winml_adapter_c_api.h           |  8 ----
 winml/adapter/winml_adapter_dml.cpp           | 10 -----
 winml/lib/Api.Ort/OnnxruntimeEngine.cpp       | 13 ------
 winml/lib/Api.Ort/OnnxruntimeEngine.h         |  2 -
 winml/lib/Api/LearningModelSession.cpp        | 10 -----
 winml/lib/Api/LearningModelSession.h          |  6 ---
 winml/lib/Common/inc/iengine.h                |  3 --
 15 files changed, 41 insertions(+), 103 deletions(-)

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index e56f4bb19d..e3315d0118 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -136,6 +136,13 @@ class IExecutionProvider {
   */
   virtual common::Status OnRunEnd();
 
+  /**
+     Called when session creation is complete.
+     This provides an opportunity for execution providers to optionally synchronize and
+     clean up their temporary resources to reduce memory and ensure the first run is fast.
+  */
+  virtual common::Status OnSessionInitializationEnd();
+
   void InsertAllocator(AllocatorPtr allocator);
 
   /**
diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc
index c59e0a5309..ea726e1309 100644
--- a/onnxruntime/core/framework/execution_provider.cc
+++ b/onnxruntime/core/framework/execution_provider.cc
@@ -49,6 +49,8 @@ common::Status IExecutionProvider::OnRunStart() { return Status::OK(); }
 
 common::Status IExecutionProvider::OnRunEnd() { return Status::OK(); }
 
+common::Status IExecutionProvider::OnSessionInitializationEnd() { return Status::OK(); }
+
 void IExecutionProvider::InsertAllocator(AllocatorPtr allocator) {
   const OrtMemoryInfo& info = allocator->Info();
   const int key = MakeKey(info.id, info.mem_type);
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
index 05f4d94307..9dfbd0e7ea 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h
@@ -34,8 +34,6 @@ namespace Dml
     void FlushContext(onnxruntime::IExecutionProvider* provider);    
     void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode);
     void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider);
-    void TrimUploadHeap(onnxruntime::IExecutionProvider * provider);
-    void WaitForGpuCompletion(onnxruntime::IExecutionProvider * provider);
     
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider, 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
index d7428c698b..5bc4f9654b 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -532,17 +532,6 @@ namespace Dml
         return onnxruntime::common::Status::OK();
     }
 
-    Status ExecutionProviderImpl::WaitForGpuCompletion()
-    {
-        assert(!m_closed);
-
-        Flush();
-        m_context->GetCurrentCompletionEvent().WaitForSignal();
-        m_context->ReleaseCompletedReferences();
-
-        return Status::OK();
-    }
-
     void __stdcall ExecutionProviderImpl::Flush() const
     {
         assert(!m_closed);
@@ -558,11 +547,6 @@ namespace Dml
     {
          m_context->ReleaseCompletedReferences();
     }
-    
-    void ExecutionProviderImpl::TrimUploadHeap()
-    {
-        m_uploadHeap->Trim();
-    }
 
     void ExecutionProviderImpl::QueueReference(IUnknown* object) 
     {
@@ -701,6 +685,20 @@ namespace Dml
         return m_cpuOutputAllocator;
     }
 
+    
+    onnxruntime::common::Status ExecutionProviderImpl::OnSessionInitializationEnd() 
+    {
+        // Flush and trim resources, including staging memory used to upload weights.
+        // This reduces memory usage immediately after session creation, and avoids
+        // performance impact of deallocation during first evaluation.
+        Flush();
+        m_context->GetCurrentCompletionEvent().WaitForSignal();
+        m_context->ReleaseCompletedReferences();
+        m_uploadHeap->Trim();
+
+        return onnxruntime::common::Status::OK();
+    }
+
     std::unique_ptr<onnxruntime::IExecutionProvider> CreateExecutionProvider(
         IDMLDevice* dmlDevice,
         ID3D12CommandQueue* commandQueue,
@@ -733,18 +731,6 @@ namespace Dml
         dmlexecutionprovider->ReleaseCompletedReferences();
     }
 
-    void TrimUploadHeap(onnxruntime::IExecutionProvider * provider)
-    {
-        ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
-        dmlexecutionprovider->TrimUploadHeap();
-    }
-
-    void WaitForGpuCompletion(onnxruntime::IExecutionProvider * provider)
-    {
-        ExecutionProvider* dmlexecutionprovider = static_cast<Dml::ExecutionProvider*>(provider);
-        dmlexecutionprovider->WaitForGpuCompletion();
-    }
-
     onnxruntime::common::Status CopyTensor(
         onnxruntime::IExecutionProvider* provider, 
         const onnxruntime::Tensor& src, 
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
index 58f73f62ba..ca852d4ac9 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -39,8 +39,6 @@ namespace Dml
 
         void ReleaseCompletedReferences();
 
-        void TrimUploadHeap();
-
     public: // implements Dml::IExecutionProvider
         STDMETHOD(GetD3DDevice)(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept final;
 
@@ -92,7 +90,6 @@ namespace Dml
         uint32_t GetSuppportedDeviceDataTypeMask() const;
 
         onnxruntime::common::Status CopyTensor(const onnxruntime::Tensor& src, onnxruntime::Tensor& dst) const;
-        onnxruntime::common::Status WaitForGpuCompletion();
 
         // IWinmlExecutionProvider methods
         void QueueReference(IUnknown* object) override;
@@ -157,7 +154,9 @@ namespace Dml
         std::shared_ptr<onnxruntime::IAllocator> GetCpuOutputAllocator();
 
         std::shared_ptr<const Windows::AI::MachineLearning::Adapter::InternalRegistrationInfoMap> 
-        GetInternalRegistrationInfoMap() const;
+        GetInternalRegistrationInfoMap() const;        
+        
+        onnxruntime::common::Status OnSessionInitializationEnd();
 
     private:
         void Initialize(ID3D12CommandQueue* queue, ExecutionProvider& executionProvider);
@@ -221,31 +220,28 @@ namespace Dml
             bool enableMetacommands = true
         );
         
-        std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const final
+        std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const final override
         {
             return std::make_unique<DataTransfer>(m_impl.Get());
         }
 
-        const void* GetExecutionHandle() const noexcept final
+        const void* GetExecutionHandle() const noexcept final override
         {
             return m_impl.Get();
         }
 
-        std::shared_ptr<onnxruntime::KernelRegistry> GetKernelRegistry() const final
+        std::shared_ptr<onnxruntime::KernelRegistry> GetKernelRegistry() const final override
         {
             return m_impl->GetKernelRegistry();
         }
 
         std::vector<std::unique_ptr<onnxruntime::ComputeCapability>>
             GetCapability(const onnxruntime::GraphViewer& graph,
-                const std::vector<const onnxruntime::KernelRegistry*>& kernel_registries) const final;
+                const std::vector<const onnxruntime::KernelRegistry*>& kernel_registries) const final override;
 
-        // Not to be confused with IExecutionProvider::Sync() const.  The DML provider handles 
-        // synchronization when copying inputs and outputs, therefore doesn't override the 
-        // default ORT method, which does nothin.
-        onnxruntime::common::Status WaitForGpuCompletion()
-        {
-            return m_impl->WaitForGpuCompletion();
+        onnxruntime::common::Status OnSessionInitializationEnd() override
+        { 
+            return m_impl->OnSessionInitializationEnd();
         }
 
         void Flush()
@@ -262,11 +258,6 @@ namespace Dml
         {
             return m_impl->ReleaseCompletedReferences();
         }
-
-        void TrimUploadHeap()
-        {
-            m_impl->TrimUploadHeap();
-        }
         
         ExecutionProviderImpl* GetImpl()
         {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 5aa30f927f..7a662117b3 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -929,6 +929,14 @@ common::Status InferenceSession::Initialize() {
   if (session_profiler_.IsEnabled()) {
     session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, "session_initialization", tp);
   }
+
+  if (status.IsOK()) {
+    // Notify every provider, but keep the first failure so Initialize() returns it.
+    for (auto& xp : execution_providers_) {
+      auto end_status = xp->OnSessionInitializationEnd();
+      if (status.IsOK()) status = end_status;
+    }
+  }
   return status;
 }
 
diff --git a/winml/adapter/winml_adapter_apis.h b/winml/adapter/winml_adapter_apis.h
index a4477c264a..cb289eb617 100644
--- a/winml/adapter/winml_adapter_apis.h
+++ b/winml/adapter/winml_adapter_apis.h
@@ -58,7 +58,6 @@ ORT_API_STATUS(SessionCopyOneInputAcrossDevices, _In_ OrtSession* session, _In_
 // Dml methods (TODO need to figure out how these need to move to session somehow...)
 ORT_API_STATUS(DmlExecutionProviderSetDefaultRoundingMode, _In_ OrtExecutionProvider* dml_provider, _In_ bool is_enabled);
 ORT_API_STATUS(DmlExecutionProviderFlushContext, _In_ OrtExecutionProvider* dml_provider);
-ORT_API_STATUS(DmlExecutionProviderTrimUploadHeap, _In_ OrtExecutionProvider* dml_provider);
 ORT_API_STATUS(DmlExecutionProviderReleaseCompletedReferences, _In_ OrtExecutionProvider* dml_provider);
 ORT_API_STATUS(DmlCreateGPUAllocationFromD3DResource, _In_ ID3D12Resource* pResource, _Out_ void** dml_resource);
 ORT_API_STATUS(DmlGetD3D12ResourceFromAllocation, _In_ OrtExecutionProvider* provider, _In_ void* allocation, _Out_ ID3D12Resource** resource);
diff --git a/winml/adapter/winml_adapter_c_api.cpp b/winml/adapter/winml_adapter_c_api.cpp
index 5b6e443c34..8e64d60d07 100644
--- a/winml/adapter/winml_adapter_c_api.cpp
+++ b/winml/adapter/winml_adapter_c_api.cpp
@@ -59,7 +59,6 @@ static constexpr WinmlAdapterApi winml_adapter_api_1 = {
     // Dml methods (TODO need to figure out how these need to move to session somehow...)
     &winmla::DmlExecutionProviderSetDefaultRoundingMode,
     &winmla::DmlExecutionProviderFlushContext,
-    &winmla::DmlExecutionProviderTrimUploadHeap,
     &winmla::DmlExecutionProviderReleaseCompletedReferences,
     &winmla::DmlCreateGPUAllocationFromD3DResource,
     &winmla::DmlFreeGPUAllocation,
diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h
index 76dec8f49a..8bbbf310e8 100644
--- a/winml/adapter/winml_adapter_c_api.h
+++ b/winml/adapter/winml_adapter_c_api.h
@@ -303,14 +303,6 @@ struct WinmlAdapterApi {
     */
   OrtStatus*(ORT_API_CALL* DmlExecutionProviderFlushContext)(_In_ OrtExecutionProvider* dml_provider)NO_EXCEPTION;
 
-  /**
-    * DmlExecutionProviderTrimUploadHeap
-	 * This api is used to trim the upload heap in the DML EP.
-    * 
-    * WinML communicates directly with DML to perform this as an optimization.
-    */
-  OrtStatus*(ORT_API_CALL* DmlExecutionProviderTrimUploadHeap)(_In_ OrtExecutionProvider* dml_provider)NO_EXCEPTION;
-
   /**
     * DmlExecutionProviderReleaseCompletedReferences
 	 * This api is used to release completed references after first run the DML EP.
diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp
index 235ebbf4d0..5574bf11b8 100644
--- a/winml/adapter/winml_adapter_dml.cpp
+++ b/winml/adapter/winml_adapter_dml.cpp
@@ -93,16 +93,6 @@ ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderFlushContext, _In_ OrtExecutionP
   API_IMPL_END
 }
 
-ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderTrimUploadHeap, _In_ OrtExecutionProvider* dml_provider) {
-  API_IMPL_BEGIN
-#ifdef USE_DML
-  auto dml_provider_internal = reinterpret_cast<::onnxruntime::IExecutionProvider*>(dml_provider);
-  Dml::TrimUploadHeap(dml_provider_internal);
-#endif  // USE_DML
-  return nullptr;
-  API_IMPL_END
-}
-
 ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderReleaseCompletedReferences, _In_ OrtExecutionProvider* dml_provider) {
   API_IMPL_BEGIN
 #ifdef USE_DML
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
index 0ed27cdb8f..c9dbd3d9c7 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp
@@ -493,19 +493,6 @@ HRESULT OnnxruntimeEngine::FlushContext() {
   return S_OK;
 }
 
-HRESULT OnnxruntimeEngine::TrimUploadHeap() {
-  auto winml_adapter_api = engine_factory_->UseWinmlAdapterApi();
-
-  OrtExecutionProvider* ort_provider;
-  RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session_.get(), 0, &ort_provider),
-                          engine_factory_->UseOrtApi());
-
-  RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderTrimUploadHeap(ort_provider),
-                          engine_factory_->UseOrtApi());
-
-  return S_OK;
-}
-
 HRESULT OnnxruntimeEngine::ReleaseCompletedReferences() {
   auto winml_adapter_api = engine_factory_->UseWinmlAdapterApi();
 
diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h b/winml/lib/Api.Ort/OnnxruntimeEngine.h
index 74a5945ce6..0a073556e1 100644
--- a/winml/lib/Api.Ort/OnnxruntimeEngine.h
+++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h
@@ -76,8 +76,6 @@ class OnnxruntimeEngine : public Microsoft::WRL::RuntimeClass<
   () override;
   STDMETHOD(FlushContext)
   () override;
-  STDMETHOD(TrimUploadHeap)
-  () override;
   STDMETHOD(ReleaseCompletedReferences)
   () override;
   STDMETHOD(Sync)
diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp
index ba6ea76d56..c135857ae8 100644
--- a/winml/lib/Api/LearningModelSession.cpp
+++ b/winml/lib/Api/LearningModelSession.cpp
@@ -277,16 +277,6 @@ LearningModelSession::GetResults(
   // Update output providers
   auto outputs = binding_impl->UpdateProviders();
 
-  // Once the first evaluation following initialization is complete, and therefore the
-  // initialization work is also complete, trim the upload heap. This is only done once
-  // to avoid requiring the extra allocation during each evaluation.
-  if (is_first_evaluate_) {
-    if (is_gpu_evaluation) {
-      engine_->TrimUploadHeap();
-    }
-    is_first_evaluate_ = false;
-  }
-
   // Create the return status object
   auto result = winrt::make<LearningModelEvaluationResult>();
   auto result_impl = result.as<winmlp::LearningModelEvaluationResult>();
diff --git a/winml/lib/Api/LearningModelSession.h b/winml/lib/Api/LearningModelSession.h
index 1a27784b49..c926b3381d 100644
--- a/winml/lib/Api/LearningModelSession.h
+++ b/winml/lib/Api/LearningModelSession.h
@@ -121,12 +121,6 @@ struct LearningModelSession : LearningModelSessionT<LearningModelSession> {
   // Synchronization
   CWinMLLock session_creation_lock_;
   CWinMLLock dml_ep_lock_;
-
-  // is_first_evaluate_ is used as a heuristic to determine
-  // when the dml upload heap can be trimmed.
-  bool is_first_evaluate_ = true;
-
-
 };
 
 }  // namespace winrt::Windows::AI::MachineLearning::implementation
diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h
index fd165b3488..b6b39e841e 100644
--- a/winml/lib/Common/inc/iengine.h
+++ b/winml/lib/Common/inc/iengine.h
@@ -99,9 +99,6 @@ IEngine : IUnknown {
   STDMETHOD(FlushContext)
   () PURE;
 
-  STDMETHOD(TrimUploadHeap)
-  () PURE;
-
   STDMETHOD(ReleaseCompletedReferences)
   () PURE;