From e89dd923875f8ad9ec66ea43db19cd637e0ccd1f Mon Sep 17 00:00:00 2001 From: Jeff Date: Tue, 14 Apr 2020 21:34:49 -0700 Subject: [PATCH] Flush and trim resources in DML EP in new OnSessionInitializationEnd method --- .../core/framework/execution_provider.h | 7 ++++ .../core/framework/execution_provider.cc | 2 + .../inc/DmlExecutionProvider.h | 2 - .../src/ExecutionProvider.cpp | 42 +++++++------------ .../src/ExecutionProvider.h | 29 +++++-------- onnxruntime/core/session/inference_session.cc | 8 ++++ winml/adapter/winml_adapter_apis.h | 1 - winml/adapter/winml_adapter_c_api.cpp | 1 - winml/adapter/winml_adapter_c_api.h | 8 ---- winml/adapter/winml_adapter_dml.cpp | 10 ----- winml/lib/Api.Ort/OnnxruntimeEngine.cpp | 13 ------ winml/lib/Api.Ort/OnnxruntimeEngine.h | 2 - winml/lib/Api/LearningModelSession.cpp | 10 ----- winml/lib/Api/LearningModelSession.h | 6 --- winml/lib/Common/inc/iengine.h | 3 -- 15 files changed, 41 insertions(+), 103 deletions(-) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index e56f4bb19d..e3315d0118 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -136,6 +136,13 @@ class IExecutionProvider { */ virtual common::Status OnRunEnd(); + /** + Called when session creation is complete. + This provides an opportunity for execution providers to optionally synchronize and + clean up their temporary resources to reduce memory usage and ensure the first run is fast. 
+ */ + virtual common::Status OnSessionInitializationEnd(); + void InsertAllocator(AllocatorPtr allocator); /** diff --git a/onnxruntime/core/framework/execution_provider.cc b/onnxruntime/core/framework/execution_provider.cc index c59e0a5309..ea726e1309 100644 --- a/onnxruntime/core/framework/execution_provider.cc +++ b/onnxruntime/core/framework/execution_provider.cc @@ -49,6 +49,8 @@ common::Status IExecutionProvider::OnRunStart() { return Status::OK(); } common::Status IExecutionProvider::OnRunEnd() { return Status::OK(); } +common::Status IExecutionProvider::OnSessionInitializationEnd() { return Status::OK(); } + void IExecutionProvider::InsertAllocator(AllocatorPtr allocator) { const OrtMemoryInfo& info = allocator->Info(); const int key = MakeKey(info.id, info.mem_type); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h index 05f4d94307..9dfbd0e7ea 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/DmlExecutionProvider.h @@ -34,8 +34,6 @@ namespace Dml void FlushContext(onnxruntime::IExecutionProvider* provider); void SetDefaultRoundingMode(onnxruntime::IExecutionProvider* provider, AllocatorRoundingMode roundingMode); void ReleaseCompletedReferences(onnxruntime::IExecutionProvider* provider); - void TrimUploadHeap(onnxruntime::IExecutionProvider * provider); - void WaitForGpuCompletion(onnxruntime::IExecutionProvider * provider); onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp index d7428c698b..5bc4f9654b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp +++ 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp @@ -532,17 +532,6 @@ namespace Dml return onnxruntime::common::Status::OK(); } - Status ExecutionProviderImpl::WaitForGpuCompletion() - { - assert(!m_closed); - - Flush(); - m_context->GetCurrentCompletionEvent().WaitForSignal(); - m_context->ReleaseCompletedReferences(); - - return Status::OK(); - } - void __stdcall ExecutionProviderImpl::Flush() const { assert(!m_closed); @@ -558,11 +547,6 @@ namespace Dml { m_context->ReleaseCompletedReferences(); } - - void ExecutionProviderImpl::TrimUploadHeap() - { - m_uploadHeap->Trim(); - } void ExecutionProviderImpl::QueueReference(IUnknown* object) { @@ -701,6 +685,20 @@ namespace Dml return m_cpuOutputAllocator; } + + onnxruntime::common::Status ExecutionProviderImpl::OnSessionInitializationEnd() + { + // Flush and trim resources, including staging memory used to upload weights. + // This reduces memory usage immediately after session creation, and avoids + // performance impact of deallocation during first evaluation. 
+ Flush(); + m_context->GetCurrentCompletionEvent().WaitForSignal(); + m_context->ReleaseCompletedReferences(); + m_uploadHeap->Trim(); + + return onnxruntime::common::Status::OK(); + } + std::unique_ptr CreateExecutionProvider( IDMLDevice* dmlDevice, ID3D12CommandQueue* commandQueue, @@ -733,18 +731,6 @@ namespace Dml dmlexecutionprovider->ReleaseCompletedReferences(); } - void TrimUploadHeap(onnxruntime::IExecutionProvider * provider) - { - ExecutionProvider* dmlexecutionprovider = static_cast(provider); - dmlexecutionprovider->TrimUploadHeap(); - } - - void WaitForGpuCompletion(onnxruntime::IExecutionProvider * provider) - { - ExecutionProvider* dmlexecutionprovider = static_cast(provider); - dmlexecutionprovider->WaitForGpuCompletion(); - } - onnxruntime::common::Status CopyTensor( onnxruntime::IExecutionProvider* provider, const onnxruntime::Tensor& src, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 58f73f62ba..ca852d4ac9 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -39,8 +39,6 @@ namespace Dml void ReleaseCompletedReferences(); - void TrimUploadHeap(); - public: // implements Dml::IExecutionProvider STDMETHOD(GetD3DDevice)(_COM_Outptr_ ID3D12Device** d3dDevice) const noexcept final; @@ -92,7 +90,6 @@ namespace Dml uint32_t GetSuppportedDeviceDataTypeMask() const; onnxruntime::common::Status CopyTensor(const onnxruntime::Tensor& src, onnxruntime::Tensor& dst) const; - onnxruntime::common::Status WaitForGpuCompletion(); // IWinmlExecutionProvider methods void QueueReference(IUnknown* object) override; @@ -157,7 +154,9 @@ namespace Dml std::shared_ptr GetCpuOutputAllocator(); std::shared_ptr - GetInternalRegistrationInfoMap() const; + GetInternalRegistrationInfoMap() const; + + onnxruntime::common::Status 
OnSessionInitializationEnd(); private: void Initialize(ID3D12CommandQueue* queue, ExecutionProvider& executionProvider); @@ -221,31 +220,28 @@ namespace Dml bool enableMetacommands = true ); - std::unique_ptr GetDataTransfer() const final + std::unique_ptr GetDataTransfer() const final override { return std::make_unique(m_impl.Get()); } - const void* GetExecutionHandle() const noexcept final + const void* GetExecutionHandle() const noexcept final override { return m_impl.Get(); } - std::shared_ptr GetKernelRegistry() const final + std::shared_ptr GetKernelRegistry() const final override { return m_impl->GetKernelRegistry(); } std::vector> GetCapability(const onnxruntime::GraphViewer& graph, - const std::vector& kernel_registries) const final; + const std::vector& kernel_registries) const final override; - // Not to be confused with IExecutionProvider::Sync() const. The DML provider handles - // synchronization when copying inputs and outputs, therefore doesn't override the - // default ORT method, which does nothin. 
- onnxruntime::common::Status WaitForGpuCompletion() - { - return m_impl->WaitForGpuCompletion(); + onnxruntime::common::Status OnSessionInitializationEnd() override + { + return m_impl->OnSessionInitializationEnd(); } void Flush() @@ -262,11 +258,6 @@ namespace Dml { return m_impl->ReleaseCompletedReferences(); } - - void TrimUploadHeap() - { - m_impl->TrimUploadHeap(); - } ExecutionProviderImpl* GetImpl() { diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 5aa30f927f..7a662117b3 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -929,6 +929,14 @@ common::Status InferenceSession::Initialize() { if (session_profiler_.IsEnabled()) { session_profiler_.EndTimeAndRecordEvent(profiling::SESSION_EVENT, "session_initialization", tp); } + + if (status.IsOK()) { + // Every provider is notified; the first failure becomes the result of Initialize(). + for (auto& xp : execution_providers_) { + const auto end_status = xp->OnSessionInitializationEnd(); + if (status.IsOK()) status = end_status; + } + } return status; } diff --git a/winml/adapter/winml_adapter_apis.h b/winml/adapter/winml_adapter_apis.h index a4477c264a..cb289eb617 100644 --- a/winml/adapter/winml_adapter_apis.h +++ b/winml/adapter/winml_adapter_apis.h @@ -58,7 +58,6 @@ ORT_API_STATUS(SessionCopyOneInputAcrossDevices, _In_ OrtSession* session, _In_ // Dml methods (TODO need to figure out how these need to move to session somehow...) 
ORT_API_STATUS(DmlExecutionProviderSetDefaultRoundingMode, _In_ OrtExecutionProvider* dml_provider, _In_ bool is_enabled); ORT_API_STATUS(DmlExecutionProviderFlushContext, _In_ OrtExecutionProvider* dml_provider); -ORT_API_STATUS(DmlExecutionProviderTrimUploadHeap, _In_ OrtExecutionProvider* dml_provider); ORT_API_STATUS(DmlExecutionProviderReleaseCompletedReferences, _In_ OrtExecutionProvider* dml_provider); ORT_API_STATUS(DmlCreateGPUAllocationFromD3DResource, _In_ ID3D12Resource* pResource, _Out_ void** dml_resource); ORT_API_STATUS(DmlGetD3D12ResourceFromAllocation, _In_ OrtExecutionProvider* provider, _In_ void* allocation, _Out_ ID3D12Resource** resource); diff --git a/winml/adapter/winml_adapter_c_api.cpp b/winml/adapter/winml_adapter_c_api.cpp index 5b6e443c34..8e64d60d07 100644 --- a/winml/adapter/winml_adapter_c_api.cpp +++ b/winml/adapter/winml_adapter_c_api.cpp @@ -59,7 +59,6 @@ static constexpr WinmlAdapterApi winml_adapter_api_1 = { // Dml methods (TODO need to figure out how these need to move to session somehow...) &winmla::DmlExecutionProviderSetDefaultRoundingMode, &winmla::DmlExecutionProviderFlushContext, - &winmla::DmlExecutionProviderTrimUploadHeap, &winmla::DmlExecutionProviderReleaseCompletedReferences, &winmla::DmlCreateGPUAllocationFromD3DResource, &winmla::DmlFreeGPUAllocation, diff --git a/winml/adapter/winml_adapter_c_api.h b/winml/adapter/winml_adapter_c_api.h index 76dec8f49a..8bbbf310e8 100644 --- a/winml/adapter/winml_adapter_c_api.h +++ b/winml/adapter/winml_adapter_c_api.h @@ -303,14 +303,6 @@ struct WinmlAdapterApi { */ OrtStatus*(ORT_API_CALL* DmlExecutionProviderFlushContext)(_In_ OrtExecutionProvider* dml_provider)NO_EXCEPTION; - /** - * DmlExecutionProviderTrimUploadHeap - * This api is used to trim the upload heap in the DML EP. - * - * WinML communicates directly with DML to perform this as an optimization. 
- */ - OrtStatus*(ORT_API_CALL* DmlExecutionProviderTrimUploadHeap)(_In_ OrtExecutionProvider* dml_provider)NO_EXCEPTION; - /** * DmlExecutionProviderReleaseCompletedReferences * This api is used to release completed references after first run the DML EP. diff --git a/winml/adapter/winml_adapter_dml.cpp b/winml/adapter/winml_adapter_dml.cpp index 235ebbf4d0..5574bf11b8 100644 --- a/winml/adapter/winml_adapter_dml.cpp +++ b/winml/adapter/winml_adapter_dml.cpp @@ -93,16 +93,6 @@ ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderFlushContext, _In_ OrtExecutionP API_IMPL_END } -ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderTrimUploadHeap, _In_ OrtExecutionProvider* dml_provider) { - API_IMPL_BEGIN -#ifdef USE_DML - auto dml_provider_internal = reinterpret_cast<::onnxruntime::IExecutionProvider*>(dml_provider); - Dml::TrimUploadHeap(dml_provider_internal); -#endif // USE_DML - return nullptr; - API_IMPL_END -} - ORT_API_STATUS_IMPL(winmla::DmlExecutionProviderReleaseCompletedReferences, _In_ OrtExecutionProvider* dml_provider) { API_IMPL_BEGIN #ifdef USE_DML diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp index 0ed27cdb8f..c9dbd3d9c7 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.cpp +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.cpp @@ -493,19 +493,6 @@ HRESULT OnnxruntimeEngine::FlushContext() { return S_OK; } -HRESULT OnnxruntimeEngine::TrimUploadHeap() { - auto winml_adapter_api = engine_factory_->UseWinmlAdapterApi(); - - OrtExecutionProvider* ort_provider; - RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->SessionGetExecutionProvider(session_.get(), 0, &ort_provider), - engine_factory_->UseOrtApi()); - - RETURN_HR_IF_NOT_OK_MSG(winml_adapter_api->DmlExecutionProviderTrimUploadHeap(ort_provider), - engine_factory_->UseOrtApi()); - - return S_OK; -} - HRESULT OnnxruntimeEngine::ReleaseCompletedReferences() { auto winml_adapter_api = engine_factory_->UseWinmlAdapterApi(); diff --git a/winml/lib/Api.Ort/OnnxruntimeEngine.h 
b/winml/lib/Api.Ort/OnnxruntimeEngine.h index 74a5945ce6..0a073556e1 100644 --- a/winml/lib/Api.Ort/OnnxruntimeEngine.h +++ b/winml/lib/Api.Ort/OnnxruntimeEngine.h @@ -76,8 +76,6 @@ class OnnxruntimeEngine : public Microsoft::WRL::RuntimeClass< () override; STDMETHOD(FlushContext) () override; - STDMETHOD(TrimUploadHeap) - () override; STDMETHOD(ReleaseCompletedReferences) () override; STDMETHOD(Sync) diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index ba6ea76d56..c135857ae8 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -277,16 +277,6 @@ LearningModelSession::GetResults( // Update output providers auto outputs = binding_impl->UpdateProviders(); - // Once the first evaluation following initialization is complete, and therefore the - // initialization work is also complete, trim the upload heap. This is only done once - // to avoid requiring the extra allocation during each evaluation. - if (is_first_evaluate_) { - if (is_gpu_evaluation) { - engine_->TrimUploadHeap(); - } - is_first_evaluate_ = false; - } - // Create the return status object auto result = winrt::make(); auto result_impl = result.as(); diff --git a/winml/lib/Api/LearningModelSession.h b/winml/lib/Api/LearningModelSession.h index 1a27784b49..c926b3381d 100644 --- a/winml/lib/Api/LearningModelSession.h +++ b/winml/lib/Api/LearningModelSession.h @@ -121,12 +121,6 @@ struct LearningModelSession : LearningModelSessionT { // Synchronization CWinMLLock session_creation_lock_; CWinMLLock dml_ep_lock_; - - // is_first_evaluate_ is used as a heuristic to determine - // when the dml upload heap can be trimmed. 
- bool is_first_evaluate_ = true; - - }; } // namespace winrt::Windows::AI::MachineLearning::implementation diff --git a/winml/lib/Common/inc/iengine.h b/winml/lib/Common/inc/iengine.h index fd165b3488..b6b39e841e 100644 --- a/winml/lib/Common/inc/iengine.h +++ b/winml/lib/Common/inc/iengine.h @@ -99,9 +99,6 @@ IEngine : IUnknown { STDMETHOD(FlushContext) () PURE; - STDMETHOD(TrimUploadHeap) - () PURE; - STDMETHOD(ReleaseCompletedReferences) () PURE;