// Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. #include "lib/Api/pch/pch.h" #include "LearningModelSession.h" #include "ImageFeatureDescriptor.h" #include "LearningModel.h" #include "LearningModelBinding.h" #include "LearningModelEvaluationResult.h" #include "LearningModelDevice.h" #include "LearningModelSessionOptions.h" #include "TensorFeatureDescriptor.h" #include "TelemetryEvent.h" #include "D3DDeviceCache.h" static const auto c_enable_debug_output = L"EnableDebugOutput"; namespace guid_details { // This GUID is to be used for delimiting ML-related categories of capturable work. // {D113B493-BBA2-4993-8608-D706A73B91CE} struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) __declspec(novtable) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; } // namespace guid_details static const GUID WINML_PIX_EVAL_CAPTURABLE_WORK_GUID = __uuidof(guid_details::WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); namespace WINMLP { LearningModelSession::LearningModelSession(_winml::IEngine* engine) : operator_registry_(nullptr, nullptr), model_(nullptr), device_(LearningModelDeviceKind::Cpu), session_options_(nullptr) { engine_.copy_from(engine); } LearningModelSession::LearningModelSession(winml::LearningModel const& model) try : LearningModelSession(model, make(LearningModelDeviceKind::Default)) { } WINML_CATCH_ALL LearningModelSession::LearningModelSession( winml::LearningModel const& model, winml::LearningModelDevice const& deviceToRunOn ) try : LearningModelSession(model, deviceToRunOn, nullptr) { } WINML_CATCH_ALL LearningModelSession::LearningModelSession( winml::LearningModel const& model, winml::LearningModelDevice const& deviceToRunOn, winml::LearningModelSessionOptions const& learningModelSessionOptions ) try : operator_registry_(nullptr, nullptr), model_(model), device_(deviceToRunOn), session_options_(learningModelSessionOptions) { Initialize(); } WINML_CATCH_ALL _winml::IModel* LearningModelSession::GetOptimizedModel() { // Get the model proto auto should_close_model = session_options_ != nullptr && session_options_.CloseModelOnSessionCreation(); return GetOptimizedModel(should_close_model); } _winml::IModel* LearningModelSession::GetOptimizedModel(bool should_close_model) { com_ptr<_winml::IModel> model; { // Lock the model detach/copy since multiple threads can access concurrently CWinMLAutoLock lock(&session_creation_lock_); // Throw if the model has been disposed and is not capable of creating // new sessions. auto model_impl = model_.as(); WINML_THROW_HR_IF_TRUE_MSG(E_INVALIDARG, model_impl->IsDisposed(), "The model has been disposed."); model.attach(should_close_model ? model_impl->DetachModel() : model_impl->CloneModel()); } // Ensure that the model is runnable on the device auto isFloat16Supported = device_.as()->GetD3DDeviceCache()->IsFloat16Supported(); if (!isFloat16Supported) { WINML_THROW_IF_FAILED(model->ModelEnsureNoFloat16()); } return model.detach(); } void LearningModelSession::Initialize() { // Begin recording session creation telemetry _winmlt::TelemetryEvent session_creation_event(_winmlt::EventCategory::kSessionCreation); // Get the optimized model proto from the learning model com_ptr<_winml::IModel> model; model.attach(GetOptimizedModel()); // Create the session builder auto device_impl = device_.as(); auto model_impl = model_.as(); engine_factory_.copy_from(model_impl->GetEngineFactory()); com_ptr<_winml::IEngineBuilder> engine_builder; WINML_THROW_IF_FAILED(engine_factory_->CreateEngineBuilder(engine_builder.put())); if (device_impl->IsCpuDevice() == false) { WINML_THROW_IF_FAILED(engine_builder->SetD3D12Resources(device_impl->GetD3DDevice(), device_impl->GetDeviceQueue()) ); WINML_THROW_IF_FAILED(engine_builder->SetMetacommandsEnabled(device_impl->MetacommandsEnabled())); } auto num_intra_op_threads = device_impl->NumberOfIntraOpThreads(); auto allow_spinning = device_impl->AllowSpinning(); // Make onnxruntime apply the batch size override, if any if (session_options_) { if (session_options_.BatchSizeOverride() != 0) { WINML_THROW_IF_FAILED(engine_builder->SetBatchSizeOverride(session_options_.BatchSizeOverride())); } com_ptr session_options_impl = session_options_.as(); // Make onnxruntime apply named dimension overrides, if any if (session_options_impl && session_options_impl->NamedDimensionOverrides().Size() > 0) { WINML_THROW_IF_FAILED(engine_builder->SetNamedDimensionOverrides(session_options_impl->NamedDimensionOverrides()) ); } allow_spinning = session_options_impl->GetIntraOpThreadSpinning(); num_intra_op_threads = session_options_impl->GetIntraOpNumThreads(); const auto& paths = session_options_impl->GetCustomOpLibraryPaths(); for (const auto& path : paths) { auto path_str = _winml::Strings::UTF8FromHString(path); WINML_THROW_IF_FAILED(engine_builder->RegisterCustomOpsLibrary(path_str.c_str())); } } bool create_local_thread_pool = allow_spinning != device_impl->AllowSpinning() || num_intra_op_threads != device_impl->NumberOfIntraOpThreads(); if (create_local_thread_pool) { WINML_THROW_IF_FAILED(engine_builder->SetIntraOpThreadSpinning(allow_spinning)); WINML_THROW_IF_FAILED(engine_builder->SetIntraOpNumThreadsOverride(num_intra_op_threads)); } else { winrt::com_ptr<_winml::IThreading> thread_pool = nullptr; WINML_THROW_IF_FAILED(device_impl->GetThreadPool(thread_pool.put())); if (thread_pool == nullptr) { WINML_THROW_IF_FAILED(engine_factory_->CreateThreadPool(allow_spinning, num_intra_op_threads, thread_pool.put())); WINML_THROW_IF_FAILED(device_impl->CacheThreadPool(thread_pool.get())); } WINML_THROW_IF_FAILED(engine_builder->SetThreadPool(thread_pool.get())); } com_ptr<_winml::IEngine> engine; WINML_THROW_IF_FAILED(engine_builder->CreateEngine(engine.put())); // Register the custom operator registry operator_registry_ = MLOperatorRegistry(model_impl->GetOperatorRegistry(), [](auto registry) { registry->Release(); }); WINML_THROW_IF_FAILED(engine->RegisterCustomRegistry(operator_registry_.get())); // Register transformers - this should probably not be exposed on IEngine, but an internal call as this configuration step is ort specific. WINML_THROW_IF_FAILED(engine->RegisterGraphTransformers()); // Load the model into the session WINML_THROW_IF_FAILED(engine->LoadModel(model.get())); // the session owns the model_proto now, it used detach() model = nullptr; // Initialize the session WINML_THROW_IF_FAILED(engine->Initialize()); // Cache the constructed session engine_ = engine; } wfc::IPropertySet LearningModelSession::EvaluationProperties() try { if (evaluation_properties_ == nullptr) { evaluation_properties_ = wfc::PropertySet(); } return evaluation_properties_; } WINML_CATCH_ALL winml::LearningModel LearningModelSession::Model() try { return model_; } WINML_CATCH_ALL winml::LearningModelDevice LearningModelSession::Device() try { return device_; } WINML_CATCH_ALL winml::LearningModelSessionOptions LearningModelSession::Options() try { return session_options_; } WINML_CATCH_ALL auto CreateBinding(LearningModelSession& session, wfc::IMap const features) { auto binding = winrt::make(session); for (auto feature : features.GetView()) { binding.Bind(feature.Key(), feature.Value()); } return binding; } winml::LearningModelEvaluationResult LearningModelSession::EvaluateFeatures( wfc::IMap const features, hstring const correlation_id ) try { auto binding = CreateBinding(*this, features); return Evaluate(binding, correlation_id); } WINML_CATCH_ALL wf::IAsyncOperation LearningModelSession::EvaluateFeaturesAsync( wfc::IMap const features, hstring const correlation_id ) { auto binding = CreateBinding(*this, features); return EvaluateAsync(binding, correlation_id); } uint64_t LearningModelSession::Run(winrt::com_ptr binding_impl) { CheckClosed(); // if this is being called on the GPU, grab the DML lock // the DML EP is not thread safe. auto device = device_.as(); CWinMLAutoLock lock(!device->IsCpuDevice() ? GetDMLEPLock() : nullptr); binding_impl->BindUnboundOutputs(); auto& input_names = binding_impl->GetInputNames(); std::vector input_names_raw; std::transform(std::begin(input_names), std::end(input_names), std::back_inserter(input_names_raw), [&](auto& name) { return name.c_str(); }); auto& inputs = binding_impl->GetInputs(); std::vector<_winml::IValue*> inputs_raw; std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(inputs_raw), [&](auto& input) { return input.get(); }); auto& output_names = binding_impl->GetOutputNames(); std::vector output_names_raw; std::transform( std::begin(output_names), std::end(output_names), std::back_inserter(output_names_raw), [&](auto& name) { return name.c_str(); } ); auto outputs = binding_impl->GetOutputs(); std::vector<_winml::IValue*> outputs_raw; std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(outputs_raw), [&](auto& input) { return input.get(); }); WINML_THROW_IF_FAILED(engine_->Run( input_names_raw.data(), inputs_raw.data(), input_names_raw.size(), output_names_raw.data(), outputs_raw.data(), output_names_raw.size() )); if (!device->IsCpuDevice()) { // Flush the D3D12 work from the DML execution provider and queue a fence before we release the lock. // This allows us to wait without holding onto the lock in GetResults. WINML_THROW_IF_FAILED(engine_->FlushContext()); return device->GetD3DDeviceCache()->QueueFenceToD3D12(); } // If it's the cpu then just return zero. fence value will be unused. return 0; } winml::LearningModelEvaluationResult LearningModelSession::GetResults( winrt::com_ptr binding_impl, hstring const& correlation_id, uint64_t evaluation_complete_fence ) { // First wait on the fence value for the expected frame. This is passed in so that // the fence value is added to the queue in a thread safe manor. auto device = device_.as(); auto is_gpu_evaluation = !device->IsCpuDevice(); if (is_gpu_evaluation) { device->GetD3DDeviceCache()->WaitForFenceValue(evaluation_complete_fence); } // if this is being called on the GPU, grab the DML lock // the DML EP is not thread safe. CWinMLAutoLock lock(is_gpu_evaluation ? GetDMLEPLock() : nullptr); if (is_gpu_evaluation) { // For DML we aren't using the Sync function because we want to make fencing the // completed frame thread safe while not holding the lock while waiting for the gpu. WINML_THROW_IF_FAILED(engine_->ReleaseCompletedReferences()); } else { // For CPU call the standard Sync function WINML_THROW_IF_FAILED(engine_->Sync()); } // This isn't the best we are holding the lock while we wait for detensorize on the GPU. // Update output providers auto outputs = binding_impl->UpdateProviders(); // Create the return status object auto result = winrt::make(); auto result_impl = result.as(); result_impl->Succeeded(true); result_impl->ErrorStatus(0); result_impl->CorrelationId(correlation_id); result_impl->SetOutputs(std::move(outputs)); return result; } wf::IAsyncOperation LearningModelSession::EvaluateAsync( winml::LearningModelBinding binding, hstring const correlation_id ) { _winmlt::TelemetryEvent kEvaluateModel_event(_winmlt::EventCategory::kEvaluation); auto device = device_.as(); // Get the binding collection auto binding_impl = binding.as(); ApplyEvaluationProperties(); // If we're running on the CPU, then return now and process the rest in the background. // If we're running on the GPU, then queue up the work first (fast) and wait for the // results (slow) in the background. bool should_queue_work = (!device->IsCpuDevice()); if (!should_queue_work) { co_await resume_background(); } com_ptr queue; queue.copy_from(device->GetDeviceQueue()); com_ptr capture_interface = queue.try_as(); // markers for PIX debugging if (capture_interface != nullptr) { capture_interface->BeginCapturableWork(WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); } // call Run synchronously on the calling thread to queue up the work uint64_t evaluation_complete_fence = Run(binding_impl); // markers for PIX debugging if (capture_interface) { capture_interface->EndCapturableWork(WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); } // after the work is queued, return to the caller if (should_queue_work) { // Queue detensorization co_await resume_background(); } // Get the Results on a background thread whenever they're ready co_return GetResults(binding_impl, correlation_id, evaluation_complete_fence); } winml::LearningModelEvaluationResult LearningModelSession::Evaluate( winml::LearningModelBinding binding, hstring const& correlation_id ) try { ToggleProfiler(); _winmlt::TelemetryEvent kEvaluateModel_event(_winmlt::EventCategory::kEvaluation); ApplyEvaluationProperties(); auto device = device_.as(); com_ptr queue; queue.copy_from(device->GetDeviceQueue()); com_ptr capture_interface = queue.try_as(); // markers for PIX debugging if (capture_interface != nullptr) { capture_interface->BeginCapturableWork(WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); } // Get the binding collection auto binding_impl = binding.as(); uint64_t evaluation_complete_fence = Run(binding_impl); // markers for PIX debugging if (capture_interface) { capture_interface->EndCapturableWork(WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); } return GetResults(binding_impl, correlation_id, evaluation_complete_fence); } WINML_CATCH_ALL void LearningModelSession::Close() { engine_ = nullptr; } void LearningModelSession::ApplyEvaluationProperties() try { if (evaluation_properties_) { auto is_debug_output_enabled = evaluation_properties_.HasKey(c_enable_debug_output); if (is_debug_output_enabled) { engine_factory_->EnableDebugOutput(is_debug_output_enabled); } } } WINML_CATCH_ALL void LearningModelSession::ToggleProfiler() { CheckClosed(); auto is_provider_enabled = TraceLoggingProviderEnabled( ::winml_trace_logging_provider, WINEVENT_LEVEL_VERBOSE, WINML_PROVIDER_KEYWORD_LOTUS_PROFILING ); if (is_provider_enabled) { engine_->StartProfiling(); } else { engine_->EndProfiling(); } } _winml::IEngine* LearningModelSession::GetEngine() { return engine_.get(); } void LearningModelSession::CheckClosed() { if (!engine_) { WINML_THROW_HR(RO_E_CLOSED); } } STDMETHODIMP LearningModelSession::GetIntraOpNumThreads(uint32_t* numThreads) { return engine_->GetNumberOfIntraOpThreads(numThreads); } STDMETHODIMP LearningModelSession::GetIntraOpThreadSpinning(boolean* allowSpinning) { bool allowSpinningBool; RETURN_IF_FAILED(engine_->GetIntraOpThreadSpinning(&allowSpinningBool)); *allowSpinning = static_cast(allowSpinningBool); return S_OK; } winml::LearningModelSession LearningModelSession::CreateInertSession(_winml::IEngine* engine) { return winrt::make(engine); } } // namespace WINMLP