From 30bb0959dc55b823bad9a7d774a8fca85c9358e6 Mon Sep 17 00:00:00 2001
From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com>
Date: Tue, 20 Jun 2023 11:09:00 -0700
Subject: [PATCH] [NNAPI EP] Add ReduceMean Op support (#16294)

### Description
<!-- Describe your changes. -->

As title.

Special cases for ReduceMean:
[UPDATE] The cases listed below as unsupported are now supported: when `axes`
is missing or empty, an `axes` input listing all axes is provided to NNAPI.
Behavior when `axes` is not provided or is provided as an empty vector:
For ReduceMean opset version 18:
- Supports the case where `axes` is provided as empty and
`noop_with_empty_axes` is set to true.
- Supports the case where `axes` is not provided and `noop_with_empty_axes`
is set to true.
Both cases are treated as an identity op.
- Does not support the case where `axes` is not provided (or is provided as
empty) but `noop_with_empty_axes` is set to false.

For ReduceMean opset version 13 and earlier:
- Does not support the case where the `axes` attribute is not provided (ONNX
treats this as the default behavior of reducing all dimensions, and that
case is not implemented by NNAPI).


https://developer.android.com/ndk/reference/group/neural-networks#group___neural_networks_1ggaabbe492c60331b13038e39d4207940e0a047fe95a35b27f45c05432b6ca18eb6c

> 1: A 1-D Tensor of
[ANEURALNETWORKS_TENSOR_INT32](https://developer.android.com/ndk/reference/group/neural-networks#group___neural_networks_1ggaf06d1affd33f3bc698d0c04eceb23298ac34965d8e76ac5acfddf5acd9e40f896).
The dimensions to reduce. Must be in the range [-rank(input_tensor),
rank(input_tensor)). NOTE: When the operation was introduced, the
documentation incorrectly stated that if dimensions were empty, the
operation would reduce across all dimensions. This behavior was never
implemented.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

Fixes issue #16194

---------

Co-authored-by: rachguo <rachguo@rachguos-Mini.attlocal.net>
---
 .../nnapi/nnapi_builtin/builders/helper.cc    |  14 ++
 .../nnapi/nnapi_builtin/builders/helper.h     |   4 +
 .../builders/impl/reduction_op_builder.cc     | 205 ++++++++++++++++++
 .../builders/impl/transpose_op_builder.cc     |   2 +-
 .../builders/op_builder_factory.cc            |   4 +
 .../builders/op_builder_factory.h             |   1 +
 .../builders/op_builder_helpers.cc            |  11 +-
 7 files changed, 232 insertions(+), 9 deletions(-)
 create mode 100644 onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
index 60c7dca222..3209ad734f 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc
@@ -392,5 +392,19 @@ bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit
   return true;
 }
 
+std::vector<int32_t> OnnxAxesToNnapi(gsl::span<const int64_t> onnx_axes, std::optional<size_t> input_rank) {
+  std::vector<int32_t> result;  // int32 counterparts of `onnx_axes`, in the same order
+  result.reserve(onnx_axes.size());
+  for (auto dim : onnx_axes) {  // `dim` is a by-value copy, so the input span is never modified
+    if (input_rank.has_value()) {  // axes only go through HandleNegativeAxis when a rank was supplied
+      dim = HandleNegativeAxis(dim, *input_rank);
+    }
+    // Convert from int64 to int32 via narrow<> rather than a plain static_cast.
+    result.push_back(narrow<int32_t>(dim));
+  }
+
+  return result;
+}
+
 }  // namespace nnapi
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
index 5241ebb1b7..421c55a2c9 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.h
@@ -172,5 +172,9 @@ inline uint32_t ShapeSize(const Shape& shape) {
 bool CheckIsInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
                         const std::string& input_name, const char* input_description);
 
+// Converts ONNX int64 values (mostly `axes` inputs for now) to the int32 values used for NNAPI. When `input_rank`
+// is provided, each value is first passed through HandleNegativeAxis with that rank; otherwise it is only narrowed.
+std::vector<int32_t> OnnxAxesToNnapi(gsl::span<const int64_t> onnx_axes, std::optional<size_t> input_rank = std::nullopt);
+
 }  // namespace nnapi
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
new file mode 100644
index 0000000000..618779f6d2
--- /dev/null
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
@@ -0,0 +1,205 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <onnx/onnx_pb.h>
+
+#include "core/common/logging/logging.h"
+#include "core/common/safeint.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/graph/graph_viewer.h"
+#include "core/optimizer/initializer.h"
+#include "core/providers/common.h"
+#include "core/providers/shared/utils/utils.h"
+#include "core/providers/nnapi/nnapi_builtin/builders/helper.h"
+#include "core/providers/nnapi/nnapi_builtin/builders/model_builder.h"
+#include "core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.h"
+#include "core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h"
+#include "core/providers/nnapi/nnapi_builtin/builders/impl/base_op_builder.h"
+
+using namespace android::nn::wrapper;
+
+namespace onnxruntime {
+namespace nnapi {
+
+using namespace op_builder_helpers;
+
+class ReductionOpBuilder : public BaseOpBuilder {  // NNAPI builder for reduction ops (only "ReduceMean" so far)
+  // Add operator related: skip the axes initializer and add the reduction to the NNAPI model
+ public:
+  void AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
+
+ private:
+  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const override;
+
+  // Operator support related: minimum NNAPI feature level and per-node support checks
+ private:
+  int32_t GetMinSupportedNNAPIFeatureLevel(const NodeUnit& node_unit,
+                                           const OpSupportCheckParams& params) const override;
+  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+                         const OpSupportCheckParams& params) const override;
+};
+
+// Add operator related
+
+void ReductionOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
+  const auto& inputs = node_unit.Inputs();  // input 0 is the data; input 1, if any, is the axes
+  if (inputs.size() > 1 && inputs[1].node_arg.Exists()) {  // the axes input is optional
+    model_builder.AddInitializerToSkip(inputs[1].node_arg.Name());  // axes values are read in AddToModelBuilderImpl
+  }
+}
+
+Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
+  const auto& op_type(node_unit.OpType());
+  const auto& inputs = node_unit.Inputs();
+  const auto& output = node_unit.Outputs()[0].node_arg.Name();
+  // Overview: resolve the reduce axes, then either add ANEURALNETWORKS_MEAN or register the output as the input.
+  auto& shaper(model_builder.GetShaper());
+  const auto input_shape = shaper[inputs[0].node_arg.Name()];  // copied by value (no reference taken)
+  const auto& operand_indices(model_builder.GetOperandIndices());
+  const auto& operand_types(model_builder.GetOperandTypes());
+
+  NodeAttrHelper helper(node_unit);
+
+  int32_t op_code;  // assigned in the ReduceMean branch below; every other op type returns an error first
+  if (op_type == "ReduceMean") {
+    op_code = ANEURALNETWORKS_MEAN;
+  } else {
+    // TODO: Add more reduction ops support
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ReductionOpBuilder, unknown op: ", op_type);
+  }
+
+  const bool keepdims = helper.Get("keepdims", 1) != 0;
+  const bool noop_with_empty_axes = helper.Get("noop_with_empty_axes", 0) != 0;
+
+  // Get axes for ReduceMean
+  // Note: ONNX `ReduceMean` reduces all dimensions by default when axes is missing or empty, whereas NNAPI never
+  // implemented that default for an empty 'axes'. Those cases are converted below by passing every axis explicitly.
+  // Notes from NNAPI doc:
+  // https://developer.android.com/ndk/reference/group/neural-networks#group___neural_networks_1ggaabbe492c60331b13038e39d4207940e0a047fe95a35b27f45c05432b6ca18eb6c
+  std::vector<int32_t> axes;  // stays empty when no axes are supplied by the node
+  if (node_unit.SinceVersion() >= 18) {
+    if (inputs.size() > 1 && inputs[1].node_arg.Exists()) {
+      // ReduceMean-18 uses the second optional input as axes
+      const auto& initializers(model_builder.GetInitializerTensors());
+      const auto& axes_tensor = *initializers.at(inputs[1].node_arg.Name());
+      Initializer unpacked_tensor(axes_tensor);
+      auto raw_axes = unpacked_tensor.DataAsSpan<int64_t>();
+      axes = OnnxAxesToNnapi(raw_axes, input_shape.size());  // rank is passed, so negative axes are handled
+    }
+  } else {
+    // For ReduceMean-13 or earlier, retrieve axes from the attribute
+    const auto axes_int64 = helper.Get("axes", std::vector<int64_t>{});
+    axes = OnnxAxesToNnapi(axes_int64, input_shape.size());  // rank is passed, so negative axes are handled
+  }
+
+  if (axes.empty() && !noop_with_empty_axes) {
+    // we provide an input with all axes for NNAPI here to simulate this default behavior to reduce all dimensions
+    axes.resize(input_shape.size());
+    std::iota(axes.begin(), axes.end(), 0);  // NOTE(review): <numeric> is not included directly by this file
+  }
+
+  // Add ReduceMean operation
+  InlinedVector<uint32_t> input_indices;
+  input_indices.push_back(operand_indices.at(inputs[0].node_arg.Name()));  // data
+
+  if (!axes.empty()) {
+    const auto axes_name = model_builder.GetUniqueName(node_unit.Name() + inputs[0].node_arg.Name() + "_axes");
+    Shape axes_dimen = {static_cast<uint32_t>(axes.size())};  // axes is passed as a 1-D INT32 tensor
+    const OperandType axes_operand_type(Type::TENSOR_INT32, axes_dimen);
+    ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(axes_name, axes.data(), axes_operand_type));
+
+    input_indices.push_back(operand_indices.at(axes_name));  // axes
+
+    int32_t input_size = static_cast<int32_t>(input_shape.size());  // rank of the input
+
+    // Make output dimensions
+    InlinedVector<uint32_t> output_dimen;
+    if (keepdims) {
+      output_dimen.reserve(input_size);
+    } else {
+      output_dimen.reserve(input_size - axes.size());  // NOTE(review): unsigned; wraps if axes.size() > rank
+    }
+
+    for (int32_t i = 0; i < input_size; i++) {
+      if (std::find(axes.begin(), axes.end(), i) == axes.end()) {  // axis i is not reduced: keep its size
+        output_dimen.push_back(input_shape[i]);
+      } else {
+        if (keepdims) {  // a reduced axis is kept as size 1 only when keepdims is set; otherwise it is dropped
+          output_dimen.push_back(1);
+        }
+      }
+    }
+
+    // In case of a tensor has all 1's in dimension such as {1,1,1,1} and gets all reduced,
+    // NNAPI requires the output shape to be {1}. (otherwise NNAPI will treat it as dynamic shape.)
+    if (output_dimen.empty())
+      output_dimen.push_back(1);
+
+    shaper.AddShape(output, output_dimen);
+
+    ADD_SCALAR_OPERAND(model_builder, input_indices, keepdims ? 1 : 0);  // keep_dims flag; macro body not shown here
+
+    const OperandType output_operand_type(operand_types.at(inputs[0].node_arg.Name()).type, output_dimen);
+    ORT_RETURN_IF_ERROR(model_builder.AddOperation(op_code, input_indices,
+                                                   {output}, {output_operand_type}));
+  } else {
+    // `axes` is still empty here, i.e. ReduceMean-18 with `noop_with_empty_axes` set to 1: treat as an Identity op by
+    // registering `output` on the input's operand index. NOTE(review): no shaper.AddShape(output, ...) here - confirm.
+    const OperandType output_operand_type(operand_types.at(inputs[0].node_arg.Name()).type, input_shape);
+    model_builder.RegisterOperand(output, operand_indices.at(inputs[0].node_arg.Name()), output_operand_type);
+  }
+
+  return Status::OK();
+}
+
+// Operator support related
+
+int32_t ReductionOpBuilder::GetMinSupportedNNAPIFeatureLevel(
+    const NodeUnit& node_unit, const OpSupportCheckParams& /* params */) const {
+  const auto& op(node_unit.OpType());
+  if (op == "ReduceMean") {  // ReduceMean is built as ANEURALNETWORKS_MEAN by this builder
+    return ANEURALNETWORKS_FEATURE_LEVEL_2;
+  }
+  // Fallback for any other op type; only "ReduceMean" is registered with this builder at present.
+  return ANEURALNETWORKS_FEATURE_LEVEL_3;
+}
+
+bool ReductionOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const NodeUnit& node_unit,
+                                           const OpSupportCheckParams& /* params */) const {
+  const auto& inputs = node_unit.Inputs();
+  const auto& op(node_unit.OpType());
+  // The input shape must be obtainable; the node is rejected otherwise.
+  Shape input_shape;
+  if (!GetShape(inputs[0].node_arg, input_shape))
+    return false;
+  // Only ranks 1 through 4 are accepted; an empty shape is rejected as well.
+  if (input_shape.size() > 4 || input_shape.empty()) {
+    LOGS_DEFAULT(VERBOSE) << "NNAPI reduction ops only support 1-4d shape, input is "
+                          << input_shape.size() << "d shape";
+    return false;
+  }
+  // If the optional axes input (input 1) exists it must be an initializer: its values are read at model-build time.
+  if (op == "ReduceMean") {
+    if (inputs.size() > 1 && inputs[1].node_arg.Exists()) {
+      const auto& axes_name = inputs[1].node_arg.Name();
+      if (!Contains(initializers, axes_name)) {
+        LOGS_DEFAULT(VERBOSE) << "Axes of ReduceMean must be a constant initializer.";
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void CreateReductionOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
+  CreateSharedOpBuilderImpl<ReductionOpBuilder>(
+      op_type, op_registrations,  // forwarded unchanged to the shared-builder helper
+      {
+          // TODO: Add more reduction ops support
+          "ReduceMean",  // the only op type listed for ReductionOpBuilder so far
+      });
+}
+
+}  // namespace nnapi
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
index 898c3f1872..4d243c730b 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
@@ -64,7 +64,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, co
   const auto& input = node_unit.Inputs()[0].node_arg.Name();
   const auto& output = node_unit.Outputs()[0].node_arg.Name();
   NodeAttrHelper helper(node_unit);
-  std::vector<int32_t> perm = helper.Get("perm", std::vector<int32_t>());
+  std::vector<int32_t> perm = helper.Get("perm", std::vector<int32_t>{});
   auto input_dims = static_cast<int32_t>(shaper[input].size());
   if (perm.empty()) {
     for (int32_t i = input_dims - 1; i >= 0; i--)
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.cc
index 0bcf84e6bf..3cf2fcb337 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.cc
@@ -85,6 +85,10 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
     CreateMinMaxOpBuilder("Min", op_registrations);
   }
 
+  {
+    CreateReductionOpBuilder("ReduceMean", op_registrations);
+  }
+
   return op_registrations;
 }
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.h
index 37dfc003b6..41df542437 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_factory.h
@@ -43,6 +43,7 @@ void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_
 void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateMinMaxOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 void CreateUnaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
+void CreateReductionOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 
 }  // namespace nnapi
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
index 9832338c0c..cdcf2e4bd2 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc
@@ -919,22 +919,17 @@ Status GetAxesForSqueezeAndUnSqueeze(ModelBuilder& model_builder, const NodeUnit
   if (node_unit.SinceVersion() > 12) {
     // For squeeze, axes is an optional input.If it is not supplied, return an empty axes as default to squeeze all
     // For unsqueeze, axes is a required input. This check has no effect for it
-    // TODO: Add helper function to handle the following conversion from int64 initializer to int32
     if (node_unit.Inputs().size() > 1) {
       const auto& initializers(model_builder.GetInitializerTensors());
       const auto& axes_tensor = *initializers.at(node_unit.Inputs()[1].node_arg.Name());
       Initializer unpacked_tensor(axes_tensor);
       auto raw_axes = unpacked_tensor.DataAsSpan<int64_t>();
-      const auto size = SafeInt<uint32_t>(axes_tensor.dims()[0]);
-      axes.resize(size);
-      for (uint32_t i = 0; i < size; i++) {
-        // it is unlikely we have an axis value overflow for int32
-        axes[i] = static_cast<int32_t>(raw_axes[i]);
-      }
+      axes = OnnxAxesToNnapi(raw_axes, std::nullopt);
     }
   } else {
     NodeAttrHelper helper(node_unit);
-    axes = helper.Get("axes", std::vector<int32_t>());
+    const auto axes_int64 = helper.Get("axes", std::vector<int64_t>{});
+    axes = OnnxAxesToNnapi(axes_int64, std::nullopt);
   }
 
   return Status::OK();