[NNAPI] Support non-1d tensor for C of Gemm op (#5982)

* Add support for non-1d tensor for C of Gemm * Check Android API level before adding Squeeze * Minor update * Fix to accept C only in the format {1,1,...,1,n}
2026-07-17 18:40:28 +00:00 · 2020-12-02 00:22:38 -08:00 · 2020-12-02 00:22:38 -08:00 · cdacee6696
commit cdacee6696
parent 6846c665ff
3 changed files with 143 additions and 56 deletions
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@ -179,6 +179,58 @@ static Status AddBinaryOperator(int32_t op_type,
  return Status::OK();
 }

+// Appends an ANEURALNETWORKS_SQUEEZE operation that squeezes operand `input`
+// into a newly added operand `output`.
+//   model_builder: builder that receives the axes operand and the operation
+//   node_name:     combined with `input` to derive a unique axes operand name
+//   input/output:  names of the operand to squeeze and of the squeezed result
+//   axes:          axes to squeeze; negative values are normalized through
+//                  HandleNegativeAxis, and an empty list means "squeeze every
+//                  dimension whose extent is 1"
+// Returns a FAIL status when the Android API level is below 28, otherwise the
+// first error reported by the model builder / shaper, or Status::OK().
+static Status AddSqueezeOp(ModelBuilder& model_builder,
+                           const std::string& node_name,
+                           const std::string& input, const std::string& output,
+                           vector<int32_t> axes) ORT_MUST_USE_RESULT;
+static Status AddSqueezeOp(ModelBuilder& model_builder,
+                           const std::string& node_name,
+                           const std::string& input, const std::string& output,
+                           vector<int32_t> axes) {
+  if (model_builder.GetAndroidSdkVer() < 28) {
+    return ORT_MAKE_STATUS(
+        ONNXRUNTIME, FAIL, "Squeeze is not supported on API level ", model_builder.GetAndroidSdkVer());
+  }
+
+  auto& shaper(model_builder.GetShaper());
+  const auto& operand_indices(model_builder.GetOperandIndices());
+  const auto& operand_types(model_builder.GetOperandTypes());
+
+  const auto& input_shape(shaper[input]);
+  const auto input_dims = input_shape.size();
+  for (auto& axis : axes) {
+    axis = static_cast<int32_t>(HandleNegativeAxis(axis, input_dims));
+  }
+
+  // Despite the spec of ANEURALNETWORKS_SQUEEZE at
+  // https://developer.android.com/ndk/reference/group/neural-networks
+  // stating that the axes (input 1 of ANEURALNETWORKS_SQUEEZE) is optional,
+  // the actual code of NNAPI requires the axes to be provided
+  // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/common/operations/Squeeze.cpp#31
+  if (axes.empty()) {  // Squeeze all
+    for (size_t i = 0; i < input_dims; i++) {
+      if (input_shape[i] == 1) {
+        // The index is a size_t while axes holds int32_t; convert explicitly
+        // instead of relying on an implicit narrowing conversion.
+        axes.push_back(static_cast<int32_t>(i));
+      }
+    }
+  }
+
+  // The axes are passed to NNAPI as a 1d INT32 tensor operand.
+  const auto axes_name = model_builder.GetUniqueName(node_name + input + "_axes");
+  Shape axes_dimen = {static_cast<uint32_t>(axes.size())};
+  const OperandType axes_operand_type(Type::TENSOR_INT32, axes_dimen);
+  ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(axes_name, axes.data(), axes_operand_type));
+
+  std::vector<uint32_t> input_indices;
+  input_indices.push_back(operand_indices.at(input));      // input
+  input_indices.push_back(operand_indices.at(axes_name));  // axes
+
+  // The output operand keeps the input's element type and takes the shape
+  // the shaper computes for the squeeze.
+  ORT_RETURN_IF_ERROR(shaper.Squeeze(input, axes, output));
+  const OperandType output_operand_type(operand_types.at(input).type, shaper[output]);
+  ORT_RETURN_IF_ERROR(model_builder.AddOperation(ANEURALNETWORKS_SQUEEZE, input_indices,
+                                                 {output}, {output_operand_type}, {false}));
+  return Status::OK();
+}
+
+
 enum DataLayout {
  L_0231 = 0,
  L_1230 = 1,
@ -1581,12 +1633,27 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
  uint32_t bias_idx;
  bool has_bias = (op == "Gemm") && (input_defs.size() > 2);
  if (has_bias) {
-    bias_idx = operand_indices.at(input_defs[c_idx]->Name());
+    const auto& bias = input_defs[c_idx]->Name();
+    // We need squeeze the input tensor to 1d if necessary
+    if (shaper[bias].size() > 1) {
+      std::string bias_squeezed = model_builder.GetUniqueName(node.Name() + op + "_bias_squeezed");
+      // We will use squeeze all here
+      ORT_RETURN_IF_ERROR(AddSqueezeOp(model_builder, node.Name(),
+                                       bias, bias_squeezed,
+                                       {} /* axes */));
+      bias_idx = operand_indices.at(bias_squeezed);
+      LOGS_DEFAULT(VERBOSE) << "GemmOpBuilder - Operand [" << bias << "] squeezed from "
+                            << Shape2String(shaper[bias])
+                            << " to "
+                            << Shape2String(shaper[bias_squeezed]);
+    } else {
+      bias_idx = operand_indices.at(bias);
+    }
  } else {
    // No C supplied, we need a vector of 0
-    std::string bias = node.Name() + op + "_bias";
+    std::string bias = model_builder.GetUniqueName(node.Name() + op + "_bias");
    const auto& bias_type = operand_types.at(input2).type;
-    Shape bias_dimen = {shaper[input2][0]};
+    const Shape& bias_dimen = {shaper[input2][0]};
    if (bias_type == Type::TENSOR_FLOAT32) {
      std::vector<float> buffer(bias_dimen[0], 0.f);
      OperandType bias_operand_type(Type::TENSOR_FLOAT32, bias_dimen);
@ -1767,10 +1834,6 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
 class SqueezeOpBuilder : public BaseOpBuilder {
 public:
  void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
-  static Status AddSqueezeOp(ModelBuilder& model_builder,
-                             const std::string& node_name,
-                             const std::string& input, const std::string& output,
-                             vector<int32_t> axes) ORT_MUST_USE_RESULT;

 private:
  Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node) const override ORT_MUST_USE_RESULT;
@ -1783,49 +1846,6 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
  }
 }

-/* static */ Status SqueezeOpBuilder::AddSqueezeOp(ModelBuilder& model_builder,
-                                                   const std::string& node_name,
-                                                   const std::string& input, const std::string& output,
-                                                   vector<int32_t> axes) {
-  auto& shaper(model_builder.GetShaper());
-  const auto& operand_indices(model_builder.GetOperandIndices());
-  const auto& operand_types(model_builder.GetOperandTypes());
-
-  const auto& input_shape(shaper[input]);
-  auto input_dims = input_shape.size();
-  for (auto& axis : axes) {
-    axis = static_cast<int32_t>(HandleNegativeAxis(axis, input_dims));
-  }
-
-  // Despite the spec of ANEURALNETWORKS_SQUEEZE at
-  // https://developer.android.com/ndk/reference/group/neural-networks
-  // states, that the axes (input 1 of ANEURALNETWORKS_SQUEEZE) is optional.
-  //
-  // The actual code of NNAPI requires the axes to be provided
-  // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/common/operations/Squeeze.cpp#31
-  if (axes.empty()) {  // Squeeze all
-    for (size_t i = 0; i < input_dims; i++) {
-      if (input_shape[i] == 1)
-        axes.push_back(i);
-    }
-  }
-
-  const auto axes_name = model_builder.GetUniqueName(node_name + input + "_axes");
-  Shape axes_dimen = {static_cast<uint32_t>(axes.size())};
-  const OperandType axes_operand_type(Type::TENSOR_INT32, axes_dimen);
-  ORT_RETURN_IF_ERROR(model_builder.AddOperandFromPersistMemoryBuffer(axes_name, axes.data(), axes_operand_type));
-
-  std::vector<uint32_t> input_indices;
-  input_indices.push_back(operand_indices.at(input));      // input
-  input_indices.push_back(operand_indices.at(axes_name));  // axes
-
-  ORT_RETURN_IF_ERROR(shaper.Squeeze(input, axes, output));
-  const OperandType output_operand_type(operand_types.at(input).type, shaper[output]);
-  ORT_RETURN_IF_ERROR(model_builder.AddOperation(ANEURALNETWORKS_SQUEEZE, input_indices,
-                                                 {output}, {output_operand_type}, {false}));
-  return Status::OK();
-}
-
 /* static */ vector<int32_t> SqueezeOpBuilder::GetAxes(ModelBuilder& model_builder, const Node& node) {
  vector<int32_t> axes;
  // Squeeze opset 13 use input as axes
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc
@ -719,6 +719,39 @@ bool GemmOpSupportChecker::HasSupportedInputsImpl(const Node& node) const {
      });
 }

+// Determine the length of the Gemm bias (C) once it is viewed as a 1d vector.
+// ANEURALNETWORKS_FULLY_CONNECTED only supports 1d bias, so this checks whether
+// C can be squeezed to 1d and, if so, reports the resulting vector size.
+//   c_shape:         shape of C
+//   android_sdk_ver: Android API level; a C with more than one dimension is
+//                    rejected when this is below 28
+//   size:            [out] last dimension of c_shape; written only on success
+// Returns false when C is a scalar, when C is not 1d on API level < 28, or
+// when any dimension other than the last one differs from 1.
+static bool GetBiasSize(const Shape& c_shape, int32_t android_sdk_ver, uint32_t& size) {
+  // TODO add support of scalar C for Gemm
+  const size_t rank = c_shape.size();
+  if (rank == 0) {
+    LOGS_DEFAULT(VERBOSE) << "C of Gemm cannot be a scalar";
+    return false;
+  }
+
+  const size_t last_dim = rank - 1;
+  if (last_dim > 0) {
+    if (android_sdk_ver < 28) {
+      LOGS_DEFAULT(VERBOSE) << "C of Gemm can only be 1d tensor for API level " << android_sdk_ver
+                            << " shape of C, " << Shape2String(c_shape);
+      return false;
+    }
+
+    // A C with 2 or more dimensions must have the format {1, 1, ..., 1, n},
+    // i.e. every dimension except the last one has to be 1
+    for (size_t dim = 0; dim < last_dim; ++dim) {
+      if (c_shape[dim] == 1)
+        continue;
+
+      LOGS_DEFAULT(VERBOSE) << "C of Gemm must be a vector or a tensor with only last dimension != 1"
+                            << " c_shape: " << Shape2String(c_shape);
+      return false;
+    }
+  }
+
+  size = c_shape[last_dim];
+  return true;
+}
+
 int GemmOpSupportChecker::GetMinSupportedOpSet(const Node& node) const {
  const auto& op(node.OpType());

@ -730,7 +763,7 @@ int GemmOpSupportChecker::GetMinSupportedOpSet(const Node& node) const {
 }

 bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                                             const OpSupportCheckParams& /* params */) const {
+                                             const OpSupportCheckParams& params) const {
  const auto& op_type = node.OpType();
  const auto input_defs(node.InputDefs());
  size_t a_idx = 0, b_idx = 1, c_idx = 2;  // A*B+C
@ -774,7 +807,11 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial

    if (!(transA == 0 && alpha == 1.f && beta == 1.f)) {
      LOGS_DEFAULT(VERBOSE) << "Only transA == 0, alpha == 1.0 "
-                            << "and beta == 1.0 is supported.";
+                            << "and beta == 1.0 is supported."
+                            << " transA " << transA
+                            << " transB " << transB
+                            << " alpha " << alpha
+                            << " beta " << beta;
      return false;
    }

@ -788,9 +825,13 @@ bool GemmOpSupportChecker::IsOpSupportedImpl(const InitializedTensorSet& initial
      if (!GetShape(*input_defs[c_idx], c_shape))
        return false;

-      if (c_shape.size() != 1 ||
-          c_shape[0] != (transB == 0 ? b_shape[1] : b_shape[0])) {
-        LOGS_DEFAULT(VERBOSE) << "C of Gemm must be a vector of b_shape[0]"
+      uint32_t c_size;
+      if (!GetBiasSize(c_shape, params.android_sdk_ver, c_size))
+        return false;
+
+      if (c_size != (transB == 0 ? b_shape[1] : b_shape[0])) {
+        LOGS_DEFAULT(VERBOSE) << "C of Gemm must be a vector of b_shape["
+                              << (transB == 0 ? "1" : "0") << "]"
                              << " b_shape: " << Shape2String(b_shape)
                              << " c_shape: " << Shape2String(c_shape);

--- a/onnxruntime/test/providers/cpu/math/gemm_test.cc
+++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc
@ -140,6 +140,7 @@ TEST(GemmOpTest, GemmTransBIsInitializer) {
 }

 // NNAPI EP's GEMM only works as A*B', add case only B is transposed
+// Also test NNAPI EP's handling of non-1D bias (C of Gemm)
 TEST(GemmOpTest, GemmTransB) {
  OpTester test("Gemm");

@ -152,7 +153,32 @@ TEST(GemmOpTest, GemmTransB) {
                       {1.0f, 2.0f, 3.0f, 4.0f,
                        -1.0f, -2.0f, -3.0f, -4.0f});
  test.AddInput<float>("B", {3, 4}, std::vector<float>(12, 1.0f));
-  test.AddInput<float>("C", {3}, std::vector<float>(3, 1.0f));
+  test.AddInput<float>("C", {1, 3}, std::vector<float>(3, 1.0f));
+  test.AddOutput<float>("Y", {2, 3},
+                        {11.0f, 11.0f, 11.0f,
+                         -9.0f, -9.0f, -9.0f});
+#if defined(OPENVINO_CONFIG_GPU_FP16) || defined(OPENVINO_CONFIG_GPU_FP32)
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});  // OpenVINO: Temporarily disabled due to accuracy issues
+#else
+  test.Run();
+#endif
+}
+
+// NNAPI EP's GEMM only works as A*B', add case only B is transposed
+// Also test NNAPI EP's handling of non-1D bias (C of Gemm) which is broadcastable but not valid for NNAPI
+TEST(GemmOpTest, GemmTransB_1) {
+  OpTester test("Gemm");
+
+  test.AddAttribute("transA", (int64_t)0);
+  test.AddAttribute("transB", (int64_t)1);
+  test.AddAttribute("alpha", 1.0f);
+  test.AddAttribute("beta", 1.0f);
+
+  test.AddInput<float>("A", {2, 4},
+                       {1.0f, 2.0f, 3.0f, 4.0f,
+                        -1.0f, -2.0f, -3.0f, -4.0f});
+  test.AddInput<float>("B", {3, 4}, std::vector<float>(12, 1.0f));
+  test.AddInput<float>("C", {2, 1}, std::vector<float>(2, 1.0f));
  test.AddOutput<float>("Y", {2, 3},
                        {11.0f, 11.0f, 11.0f,
                         -9.0f, -9.0f, -9.0f});