From 81cd6eacd0121711c2e6e6f8d1fc0cefcd81de99 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 23 Dec 2024 10:02:04 -0800 Subject: [PATCH] [QNN EP] Fix multithread sync bug in ETW callback (#23156) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Fixes crash in QNN dlls when an ETW callback tries to change the QNN log level. This is caused by a function that does not lock a mutex before modifying the QNN log level. ### Motivation and Context An ETW callback into QNN EP leads to a crash within QNN SDK dlls. It happens approximately 1 out of 3 full QNN unit tests runs. The cause is a multithreading synchronization bug in QNN EP. We're not always locking a mutex when ETW calls QNN EP to notify of ETW config change. There are two branches in the QNN EP callback function that try to update the QNN log handle. One branch correctly locks a mutex, but other does not lock it at all. This causes crashes within QNN dlls. - Does not lock mutex: [onnxruntime/onnxruntime/core/providers/qnn/qnn_execution_provider.cc at main · microsoft/onnxruntime](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/qnn/qnn_execution_provider.cc#L426) - Locks mutex: [onnxruntime/onnxruntime/core/providers/qnn/qnn_execution_provider.cc at main · microsoft/onnxruntime](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/qnn/qnn_execution_provider.cc#L442) The fix is to lock the mutex in both paths. --- .../qnn/builder/qnn_backend_manager.cc | 46 ++++++++++++------- .../qnn/builder/qnn_backend_manager.h | 46 ++++++++----------- .../providers/qnn/qnn_execution_provider.cc | 4 +- 3 files changed, 49 insertions(+), 47 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 3af646c3ce..fa38fad8ee 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -255,7 +255,9 @@ void QnnLogging(const char* format, } } -Status QnnBackendManager::InitializeQnnLog() { +Status QnnBackendManager::InitializeQnnLog(const logging::Logger& logger) { + logger_ = &logger; + // Set Qnn log level align with Ort log level auto ort_log_level = logger_->GetSeverity(); QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level); @@ -303,23 +305,15 @@ QnnLog_Level_t QnnBackendManager::MapOrtSeverityToQNNLogLevel(logging::Severity } } -Status QnnBackendManager::ResetQnnLogLevel() { +Status QnnBackendManager::ResetQnnLogLevel(std::optional ort_log_level) { std::lock_guard lock(logger_mutex_); - - if (backend_setup_completed_ && logger_ != nullptr) { - auto ort_log_level = logger_->GetSeverity(); - LOGS(*logger_, INFO) << "Reset Qnn log level to ORT Logger level: " << (unsigned int)ort_log_level; - return UpdateQnnLogLevel(ort_log_level); + if (!backend_setup_completed_ || logger_ == nullptr) { + return Status::OK(); } - return Status::OK(); -} - -Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { ORT_RETURN_IF(nullptr == log_handle_, "Unable to update QNN Log Level. Invalid QNN log handle."); - ORT_RETURN_IF(false == backend_setup_completed_, "Unable to update QNN Log Level. Backend setup not completed."); - ORT_RETURN_IF(nullptr == logger_, "Unable to update QNN Log Level. Invalid logger."); - QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(ort_log_level); + logging::Severity actual_log_level = ort_log_level.has_value() ? *ort_log_level : logger_->GetSeverity(); + QnnLog_Level_t qnn_log_level = MapOrtSeverityToQNNLogLevel(actual_log_level); LOGS(*logger_, INFO) << "Updating Qnn log level to: " << qnn_log_level; @@ -332,7 +326,8 @@ Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { LOGS(*logger_, ERROR) << "Invalid log handle provided to QnnLog_setLogLevel."; } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result)); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, + "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -823,7 +818,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, LOGS(logger, VERBOSE) << "Backend build version: " << sdk_build_version_; - SetLogger(&logger); + ORT_RETURN_IF_ERROR(InitializeQnnLog(logger)); LOGS(logger, VERBOSE) << "SetLogger succeed."; ORT_RETURN_IF_ERROR(InitializeBackend()); @@ -1049,6 +1044,24 @@ Status QnnBackendManager::DestroyHTPPowerConfigID(uint32_t htp_power_config_id) return Status::OK(); } +Status QnnBackendManager::TerminateQnnLog() { + std::lock_guard lock(logger_mutex_); + if (logger_ == nullptr) { + return Status::OK(); + } + + if (nullptr != qnn_interface_.logFree && nullptr != log_handle_) { + auto ret_val = qnn_interface_.logFree(log_handle_); + + // Reset QNN log handle to nullptr so other threads that are waiting on logger_mutex_ know it was freed. + log_handle_ = nullptr; + ORT_RETURN_IF(QNN_SUCCESS != ret_val, + "Unable to terminate logging in the backend."); + } + + return Status::OK(); +} + void QnnBackendManager::ReleaseResources() { if (!backend_setup_completed_) { return; @@ -1074,7 +1087,6 @@ void QnnBackendManager::ReleaseResources() { ORT_THROW("Failed to ShutdownBackend."); } - std::lock_guard lock(logger_mutex_); result = TerminateQnnLog(); if (Status::OK() != result) { ORT_THROW("Failed to TerminateQnnLog."); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index b145f2a2cd..beabc9bd71 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -96,6 +96,8 @@ class QnnBackendManager { std::unordered_map>& qnn_models, int64_t max_spill_fill_size); + // Initializes handles to QNN resources (device, logger, etc.). + // NOTE: This function locks the internal `logger_mutex_`. Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context, bool need_load_system_lib); Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id); @@ -121,34 +123,10 @@ class QnnBackendManager { const Qnn_ProfileHandle_t& GetQnnProfileHandle() { return profile_backend_handle_; } - void SetLogger(const logging::Logger* logger) { - if (logger_ == nullptr) { - logger_ = logger; - (void)InitializeQnnLog(); - } - } - - Status InitializeQnnLog(); - - Status UpdateQnnLogLevel(logging::Severity ort_log_level); - - Status ResetQnnLogLevel(); - - // Terminate logging in the backend - Status TerminateQnnLog() { - if (logger_ == nullptr) { - return Status::OK(); - } - - if (nullptr != qnn_interface_.logFree && nullptr != log_handle_) { - ORT_RETURN_IF(QNN_SUCCESS != qnn_interface_.logFree(log_handle_), - "Unable to terminate logging in the backend."); - } - - return Status::OK(); - } - - void ReleaseResources(); + // Resets the QNN log level to the given ORT log level or to the default log level if the argument is + // std::nullopt. + // NOTE: This function locks the internal `logger_mutex_`. + Status ResetQnnLogLevel(std::optional ort_log_level = std::nullopt); Status ExtractBackendProfilingInfo(); Status ExtractProfilingSubEvents(QnnProfile_EventId_t profile_event_id, std::ofstream& outfile, @@ -171,6 +149,18 @@ class QnnBackendManager { uint64_t& max_spill_fill_buffer_size); private: + // Sets the ORT logger and creates a corresponding QNN logger with the same log level. + // NOTE: caller must lock the `logger_mutex_` before calling this function. + Status InitializeQnnLog(const logging::Logger& logger); + + // Terminate logging in the backend + // NOTE: This function locks the internal `logger_mutex_`. + Status TerminateQnnLog(); + + // Releases all QNN resources. Called in the destructor. + // NOTE: This function indirectly locks the internal `logger_mutex_` via nested function calls. + void ReleaseResources(); + void* LoadLib(const char* file_name, int flags, std::string& error_msg); Status LoadQnnSystemLib(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 27e195dea7..1d9242f8a5 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -423,7 +423,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) { if ((MatchAnyKeyword & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) { auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity(); - (void)qnn_backend_manager_->UpdateQnnLogLevel(ortETWSeverity); + (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity); } if ((MatchAnyKeyword & static_cast(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) { if (Level != 0) { @@ -439,7 +439,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) { // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID); - (void)qnn_backend_manager_->ResetQnnLogLevel(); + (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt); } }); etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);