diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/base_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/base_op_builder.cc
index 7a18679329..d0c615c6ac 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/base_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/base_op_builder.cc
@@ -53,6 +53,7 @@ Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const NodeU
   };
   ORT_RETURN_IF_NOT(IsOpSupported(model_builder.GetInitializerTensors(), node_unit, params),
                     "Unsupported operator ", node_unit.OpType());
+  model_builder.SetDebugTrackNode(node_unit.Index());
   ORT_RETURN_IF_ERROR(AddToModelBuilderImpl(model_builder, node_unit));
   LOGS_DEFAULT(VERBOSE) << "Operator name: [" << node_unit.Name()
                         << "] type: [" << node_unit.OpType() << "] was added";
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
index e8432cfc01..9e8f494eb7 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.cc
@@ -28,9 +28,10 @@ namespace onnxruntime {
 namespace nnapi {
 
 ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const NnApi& nnapi_handle,
-                           gsl::span<const DeviceWrapper> nnapi_target_devices)
+                           gsl::span<const DeviceWrapper> nnapi_target_devices,
+                           TargetDeviceOption target_device_option)
     : nnapi_(nnapi_handle), graph_viewer_(graph_viewer), nnapi_model_{std::make_unique<Model>(nnapi_handle)},
-      shaper_{graph_viewer}, nnapi_target_devices_(nnapi_target_devices),
+      shaper_{graph_viewer}, nnapi_target_devices_(nnapi_target_devices), target_device_option_(target_device_option),
       nnapi_effective_feature_level_(GetNNAPIEffectiveFeatureLevel(nnapi_handle, nnapi_target_devices_)) {
   nnapi_model_->nnapi_effective_feature_level_ = nnapi_effective_feature_level_;
 }
@@ -475,6 +476,9 @@ Status ModelBuilder::AddOperations() {
 Status ModelBuilder::AddOperation(int op, const InlinedVector<uint32_t>& input_indices,
                                   const std::vector<std::string>& output_names,
                                   const std::vector<OperandType>& output_types) {
+#ifndef NDEBUG
+  operations_recorder_.emplace_back(track_node_index_, op);
+#endif
   InlinedVector<uint32_t> output_indices;
   for (size_t i = 0; i < output_types.size(); i++) {
     uint32_t index = 0;
@@ -548,47 +552,52 @@ Status ModelBuilder::Compile(std::unique_ptr<Model>& model) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                              "The model cannot run using current set of target devices, ",
                              GetDevicesDescription(nnapi_target_devices_));
-
-    } else {
+    // workaround for bugs in Android OS. sometimes ops are passed checking but failed at compilation.
+    }
+    // else // no else after return
+    if (target_device_option_ != TargetDeviceOption::ALL_DEVICES) {
       use_create_for_devices = true;
     }
   }
 
 #ifndef NDEBUG
   if (nnapi_target_devices_.size() > 1 && nnapi_target_devices_.back().type == ANEURALNETWORKS_DEVICE_CPU) {
-    auto* supported_ops = supported_ops_holder.get();
+    auto supported_ops = gsl::make_span(supported_ops_holder.get(), num_nnapi_ops_);
     RETURN_STATUS_ON_ERROR_WITH_NOTE(
         nnapi_.ANeuralNetworksModel_getSupportedOperationsForDevices(
             nnapi_model_->model_, device_handles.data(),
-            static_cast<uint32_t>(device_handles.size() - 1), supported_ops),
+            static_cast<uint32_t>(device_handles.size() - 1), supported_ops.data()),
         "on getSupportedOperationsForDevices");
 
-    std::unordered_map<std::string, int32_t> optype_support;
-    const auto& node_indices = graph_viewer_.GetNodesInTopologicalOrder();
-    for (size_t idx = 0; idx < node_indices.size(); idx++) {
-      auto node_idx = node_indices[idx];
-      const auto* node(graph_viewer_.GetNode(node_idx));
+    ORT_ENFORCE(num_nnapi_ops_==operations_recorder_.size(), "num_nnapi_ops_!=operations_recorder_.size()");
+    std::unordered_map<std::string, std::pair<int32_t, int32_t>> optype_support_status;
+    for (size_t idx = 0; idx < operations_recorder_.size(); idx++) {
+      auto [onnx_node_idx, nnapi_idx] = operations_recorder_[idx];
+      const auto* node(graph_viewer_.GetNode(onnx_node_idx));
+      auto stat_name = node->OpType() + ".nnapi_op_" + std::to_string(nnapi_idx);
+
       if (!supported_ops[idx]) {
-        optype_support[node->OpType()]++;
+        optype_support_status[stat_name].first++;
       } else {
-        optype_support[node->OpType()]--;
+        optype_support_status[stat_name].second++;
       }
     }
     size_t total_ops = 0;
-    std::string fb_op_alloc_detail, nm_op_alloc_detail;
+    std::string fallback_op_detail, normal_op_detail;
 
-    for (const auto& [op, count] : optype_support) {
-      if (count > 0) {
-        total_ops += count;
-        fb_op_alloc_detail += std::to_string(count) + "x " + op + ",";
-      } else {
-        nm_op_alloc_detail += std::to_string(-count) + "x " + op + ",";
+    for (const auto& [op, ops_status] : optype_support_status) {
+      auto& [support_cnt, unspport_cnt] = ops_status;
+      total_ops += support_cnt + unspport_cnt;
+      if (support_cnt > 0) {
+        fallback_op_detail += MakeString(support_cnt, "x ", op, ", ");
+      } else if (unspport_cnt > 0) {
+        normal_op_detail += MakeString(unspport_cnt, "x ", op, ", ");
       }
     }
 
-    LOGS_DEFAULT(VERBOSE) << total_ops << " Ops [" << fb_op_alloc_detail << "] out of " << num_nnapi_ops_
+    LOGS_DEFAULT(VERBOSE) << total_ops << " Ops [" << fallback_op_detail << "] out of " << num_nnapi_ops_
                           << " are falling-back to " << kNnapiCpuDeviceName << ", and ["
-                          << nm_op_alloc_detail << "] are running in accelerators.";
+                          << normal_op_detail << "] are running in accelerators.";
   }
 #endif
   // When calling ANeuralNetworksCompilation_createForDevices,
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
index 28d858b388..2c3b9cc815 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
@@ -6,6 +6,7 @@
 #include <unordered_set>
 
 #include "core/common/inlined_containers.h"
+#include "core/common/inlined_containers_fwd.h"
 #include "core/graph/basic_types.h"
 #include "core/providers/nnapi/nnapi_builtin/model.h"
 #include "core/providers/nnapi/nnapi_builtin/nnapi_lib/NeuralNetworksWrapper.h"
@@ -31,7 +32,7 @@ class ModelBuilder {
   using Shape = Shaper::Shape;
 
   ModelBuilder(const GraphViewer& graph_viewer, const NnApi& nnapi_handle,
-               gsl::span<const DeviceWrapper> nnapi_target_devices);
+               gsl::span<const DeviceWrapper> nnapi_target_devices, TargetDeviceOption target_device_option);
 
   common::Status Compile(std::unique_ptr<Model>& model);
 
@@ -104,6 +105,15 @@ class ModelBuilder {
 
   int32_t GetEffectiveFeatureLevel() const { return nnapi_effective_feature_level_; }
 
+  // Just for Debugging
+  void SetDebugTrackNode(const size_t node_index) {
+#ifndef NDEBUG
+    track_node_index_ = node_index;
+#else
+    ORT_UNUSED_PARAMETER(node_index);
+#endif
+  }
+
  private:
   const NnApi& nnapi_;
   const GraphViewer& graph_viewer_;
@@ -147,12 +157,20 @@ class ModelBuilder {
 
   gsl::span<const DeviceWrapper> nnapi_target_devices_;
 
+  const TargetDeviceOption target_device_option_;
   // feature_level, to decide if we can run this node on NNAPI
   int32_t nnapi_effective_feature_level_ = 0;
   // The number of nnapi operations in this model
   size_t num_nnapi_ops_ = 0;
   uint32_t next_index_ = 0;
 
+#ifndef NDEBUG
+  // tracking current node index for debugging
+  size_t track_node_index_ = 0;
+  // recording onnx node index and nnapi operation index.
+  // An onnx node might be decomposed into multiple nnapi operations
+  InlinedVector<std::pair<size_t, int32_t>> operations_recorder_;
+#endif
   // Convert the onnx model to ANeuralNetworksModel
   common::Status Prepare();
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
index 8e58ed00c9..75d3abe982 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
@@ -289,7 +289,7 @@ common::Status NnapiExecutionProvider::Compile(const std::vector<FusedNodeAndGra
     Node& fused_node = fused_node_and_graph.fused_node;
     const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
 
-    nnapi::ModelBuilder builder(graph_viewer, *nnapi_handle_, nnapi_target_devices_);
+    nnapi::ModelBuilder builder(graph_viewer, *nnapi_handle_, nnapi_target_devices_, target_device_option_);
     builder.SetUseNCHW(nnapi_flags_ & NNAPI_FLAG_USE_NCHW);
     builder.SetUseFp16(nnapi_flags_ & NNAPI_FLAG_USE_FP16);