From bf78e4d18b2f9a1ed6a91baa93f9d9d43578c832 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Tue, 21 Jul 2020 17:57:47 -0700
Subject: [PATCH] Handle cases which produce an empty output in the MatMul op
 family (#4573)

---
 .../contrib_ops/cpu/matmul_integer16.cc       |   4 +
 .../quantization/dynamic_quantize_matmul.cc   |   4 +
 .../contrib_ops/cpu/transpose_matmul.cc       |   4 +
 onnxruntime/core/providers/cpu/math/matmul.cc |   4 +
 .../core/providers/cpu/math/matmul_integer.cc |   4 +
 .../cpu/math/quantize_linear_matmul.cc        |   4 +
 .../core/providers/cuda/math/matmul.cc        |   4 +
 .../providers/cuda/math/matmul_integer.cc     |   4 +
 .../dynamic_quantize_matmul_test.cc           |   7 ++
 .../test/contrib_ops/matmul_integer16_test.cc |  14 ++-
 .../contrib_ops/transpose_matmul_op_test.cc   |   7 ++
 .../providers/cpu/math/matmul_integer_test.cc |   9 ++
 .../test/providers/cpu/math/matmul_test.cc    | 105 +++++++++---------
 .../cpu/math/quantize_linear_matmul_test.cc   |  38 +++++--
 14 files changed, 148 insertions(+), 64 deletions(-)
diff --git a/onnxruntime/contrib_ops/cpu/matmul_integer16.cc b/onnxruntime/contrib_ops/cpu/matmul_integer16.cc
index 7378cd5651..d94914cf41 100644
--- a/onnxruntime/contrib_ops/cpu/matmul_integer16.cc
+++ b/onnxruntime/contrib_ops/cpu/matmul_integer16.cc
@@ -28,6 +28,10 @@ Status MatMulInteger16<int16_t, int16_t, int32_t>::Compute(OpKernelContext* ctx)
   ORT_RETURN_IF_ERROR(helper.Compute(A->Shape(), B->Shape()));
   Tensor* Y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (Y->Shape().Size() == 0)
+    return Status::OK();
+
   for (int i = 0; i < static_cast<int>(helper.OutputOffsets().size()); i++) {
     EigenCastGEMM<int16_t, int16_t, int32_t>(
         A->template Data<int16_t>() + helper.LeftOffsets()[i],
diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
index 7a6b3ae936..b36115a66b 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -83,6 +83,10 @@ Status MatMulIntegerToFloatBase::ComputeCommon(OpKernelContext* ctx,
   ORT_RETURN_IF_ERROR(helper.Compute(a_shape, b->Shape()));
   Tensor* y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (y->Shape().Size() == 0)
+    return Status::OK();
+
   const auto* b_data = static_cast<const uint8_t*>(b->DataRaw());
   const bool b_is_signed = b->IsDataType<int8_t>();
   auto* y_data = y->template MutableData<float>();
diff --git a/onnxruntime/contrib_ops/cpu/transpose_matmul.cc b/onnxruntime/contrib_ops/cpu/transpose_matmul.cc
index 10b3e11588..fcf5d3b1c4 100644
--- a/onnxruntime/contrib_ops/cpu/transpose_matmul.cc
+++ b/onnxruntime/contrib_ops/cpu/transpose_matmul.cc
@@ -37,6 +37,10 @@ Status TransposeMatMul::Compute(OpKernelContext* context) const {
 
   Tensor* Y = context->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (Y->Shape().Size() == 0)
+    return Status::OK();
+
   const size_t num_offsets = helper.OutputOffsets().size();
   for (size_t i = 0; i < num_offsets; ++i) {
     math::Gemm<float, concurrency::ThreadPool>(
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
index 6a66fd609f..66762b5b35 100644
--- a/onnxruntime/core/providers/cpu/math/matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -72,6 +72,10 @@ Status MatMul<T>::Compute(OpKernelContext* ctx) const {
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
   Tensor* y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (y->Shape().Size() == 0)
+    return Status::OK();
+
   // Using DataRaw as int32_t/uint32_t and int64_t/uint64_t share a common
   // operator body.
   const auto* a_data = reinterpret_cast<const T*>(a->DataRaw());
diff --git a/onnxruntime/core/providers/cpu/math/matmul_integer.cc b/onnxruntime/core/providers/cpu/math/matmul_integer.cc
index b1a23c317f..a36d3dd254 100644
--- a/onnxruntime/core/providers/cpu/math/matmul_integer.cc
+++ b/onnxruntime/core/providers/cpu/math/matmul_integer.cc
@@ -38,6 +38,10 @@ Status MatMulInteger::Compute(OpKernelContext* ctx) const {
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
   Tensor* y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (y->Shape().Size() == 0)
+    return Status::OK();
+
   // validate zero points
   uint8_t a_offset = 0;
   uint8_t b_offset = 0;
diff --git a/onnxruntime/core/providers/cpu/math/quantize_linear_matmul.cc b/onnxruntime/core/providers/cpu/math/quantize_linear_matmul.cc
index 9d376f606c..faf71df08c 100644
--- a/onnxruntime/core/providers/cpu/math/quantize_linear_matmul.cc
+++ b/onnxruntime/core/providers/cpu/math/quantize_linear_matmul.cc
@@ -38,6 +38,10 @@ Status QLinearMatMul::Compute(OpKernelContext* ctx) const {
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
   Tensor* y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (y->Shape().Size() == 0)
+    return Status::OK();
+
   // validate offsets
   const auto* a_offset = ctx->Input<Tensor>(2);
   const auto* b_offset = ctx->Input<Tensor>(5);
diff --git a/onnxruntime/core/providers/cuda/math/matmul.cc b/onnxruntime/core/providers/cuda/math/matmul.cc
index f32f1f52b4..bec6cdb364 100644
--- a/onnxruntime/core/providers/cuda/math/matmul.cc
+++ b/onnxruntime/core/providers/cuda/math/matmul.cc
@@ -92,6 +92,10 @@ Status MatMul<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   Tensor* Y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (Y->Shape().Size() == 0)
+    return Status::OK();
+
   CudaT one = ToCudaType<T>::FromFloat(1.0f);
   CudaT zero = ToCudaType<T>::FromFloat(0.0f);
 
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
index be0cb953af..5c7cb81320 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
@@ -36,6 +36,10 @@ Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) cons
   ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
   Tensor* Y = ctx->Output(0, helper.OutputShape());
 
+  // Bail out early if the output is going to be empty
+  if (Y->Shape().Size() == 0)
+    return Status::OK();
+
   const int8_t* a_ptr = a->template Data<int8_t>();
   const int8_t* b_ptr = b->template Data<int8_t>();
   int32_t* output_ptr = Y->template MutableData<int32_t>();
diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
index dcc8fe7697..516f4c9f29 100644
--- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
+++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
@@ -89,6 +89,13 @@ TEST(DynamicQuantizeMatMul, UInt8_test) {
   TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8.onnx");
 }
 
+TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) {
+  std::vector<int64_t> A_dims{0, 128};  // zero-sized first dimension
+  std::vector<int64_t> B_dims{128, 128};
+  std::vector<int64_t> Y_dims{0, 128};  // NOTE(review): not read anywhere in this test; confirm whether the helper should check the output shape
+
+  TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8.onnx");
+}
 TEST(DynamicQuantizeMatMul, UInt8_test_bias) {
   std::vector<int64_t> A_dims{4, 128};
   std::vector<int64_t> B_dims{128, 128};
diff --git a/onnxruntime/test/contrib_ops/matmul_integer16_test.cc b/onnxruntime/test/contrib_ops/matmul_integer16_test.cc
index 6c4cc23960..a1032bc157 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer16_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer16_test.cc
@@ -27,13 +27,21 @@ TEST(MatmulInteger16OpTest, MatMulInteger16_2) {
   test.Run();
 }
 
+TEST(MatmulInteger16OpTest, MatMulInteger16_Empty_input) {
+  OpTester test("MatMulInteger16", 1, onnxruntime::kMSDomain);
+  test.AddInput<int16_t>("T1", {0, 2}, {});  // shape {0, 2}: no elements are supplied
+  test.AddInput<int16_t>("T2", {2, 1}, {-8, -11});
+  test.AddOutput<int32_t>("T3", {0, 1}, {});  // the expected output has no elements either
+  test.Run();
+}
 TEST(MatmulInteger16OpTest, MatMulInteger16_3) {
   OpTester test("MatMulInteger16", 1, onnxruntime::kMSDomain);
   test.AddInput<int16_t>("T1", {3, 2}, {-7, 10, 10, -1113, 22, -356});
   test.AddInput<int16_t>("T2", {2, 4}, {-8, -11, 13, 14, -99, 1234, 321, -6});
-  test.AddOutput<int32_t>("T3", {3, 4}, {-934, 12417, 3119, -158,
-                                         110107, -1373552, -357143, 6818,
-                                         35068, -439546, -113990, 2444});
+  test.AddOutput<int32_t>("T3", {3, 4},
+                          {-934, 12417, 3119, -158,
+                           110107, -1373552, -357143, 6818,
+                           35068, -439546, -113990, 2444});  // T1 x T2, one output row per line
   test.Run();
 }
 
diff --git a/onnxruntime/test/contrib_ops/transpose_matmul_op_test.cc b/onnxruntime/test/contrib_ops/transpose_matmul_op_test.cc
index d580b5f1d6..afe4f4f00d 100644
--- a/onnxruntime/test/contrib_ops/transpose_matmul_op_test.cc
+++ b/onnxruntime/test/contrib_ops/transpose_matmul_op_test.cc
@@ -84,6 +84,13 @@ std::vector<MatMulTestData<T>> GenerateSimpleTestCases() {
        {1, 2, 4},
        {20, 23, 26, 29, 56, 68, 80, 92}});
 
+  test_cases.push_back(
+      {"test 2D with empty input",
+       {0, 3},
+       {3, 4},
+       {0, 4},
+       {}});
+
   return test_cases;
 }
 
diff --git a/onnxruntime/test/providers/cpu/math/matmul_integer_test.cc b/onnxruntime/test/providers/cpu/math/matmul_integer_test.cc
index d5abeb5406..b54c04fe82 100644
--- a/onnxruntime/test/providers/cpu/math/matmul_integer_test.cc
+++ b/onnxruntime/test/providers/cpu/math/matmul_integer_test.cc
@@ -26,6 +26,15 @@ TEST(MatmulIntegerOpTest, MatMulInteger_2D) {
   test.Run();
 }
 
+TEST(MatmulIntegerOpTest, MatMulInteger_2D_empty_input) {
+  OpTester test("MatMulInteger", 10);
+  test.AddInput<uint8_t>("T1", {0, 3}, {});  // shape {0, 3}: no elements are supplied
+  test.AddInput<uint8_t>("T2", {3, 2}, {1, 4, 2, 5, 3, 6});
+  test.AddInput<uint8_t>("a_zero_point", {}, {12});
+  test.AddInput<uint8_t>("b_zero_point", {}, {0});
+  test.AddOutput<int32_t>("T3", {0, 2}, {});  // the expected output has no elements either
+  test.Run();
+}
 TEST(MatmulIntegerOpTest, MatMulInteger) {
   OpTester test("MatMulInteger", 10);
   test.AddInput<uint8_t>("T1", {1, 1}, {11});
diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc
index 418dee24f0..0f813a63aa 100644
--- a/onnxruntime/test/providers/cpu/math/matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc
@@ -17,79 +17,84 @@ struct MatMulTestData {
 };
 
 template <typename T>
-std::vector<MatMulTestData<T>> GenerateTestCases()
-{
+std::vector<MatMulTestData<T>> GenerateTestCases() {  // builds and returns the list of MatMul test cases
   std::vector<MatMulTestData<T>> test_cases;
 
   test_cases.push_back(
-    {"test padding and broadcast",
-    {3, 1, 1, 2},
-    {2, 2, 2},
-    {3, 2, 1, 2},
-    {2, 3, 6, 7, 6, 11, 26, 31, 10, 19, 46, 55}});
+      {"test padding and broadcast",
+       {3, 1, 1, 2},
+       {2, 2, 2},
+       {3, 2, 1, 2},
+       {2, 3, 6, 7, 6, 11, 26, 31, 10, 19, 46, 55}});
 
   test_cases.push_back(
-    {"test padding and broadcast",
-    {2, 3, 2},
-    {3, 2, 2, 1},
-    {3, 2, 3, 1},
-    {1, 3, 5, 33, 43, 53, 5, 23, 41, 85, 111, 137, 9, 43, 77, 137, 179, 221}});
+      {"test padding and broadcast",
+       {2, 3, 2},
+       {3, 2, 2, 1},
+       {3, 2, 3, 1},
+       {1, 3, 5, 33, 43, 53, 5, 23, 41, 85, 111, 137, 9, 43, 77, 137, 179, 221}});
 
   test_cases.push_back(
-    {"test left 1D",
-    {2},
-    {3, 2, 1},
-    {3, 1},
-    {1, 3, 5}});
+      {"test left 1D",
+       {2},
+       {3, 2, 1},
+       {3, 1},
+       {1, 3, 5}});
 
   test_cases.push_back(
-    {"test right 1D",
-    {3, 1, 2},
-    {2},
-    {3, 1},
-    {1, 3, 5}});
+      {"test right 1D",
+       {3, 1, 2},
+       {2},
+       {3, 1},
+       {1, 3, 5}});
 
   test_cases.push_back(
-    {"test scalar output",
-    {3},
-    {3},
-    {},
-    {5}});
+      {"test scalar output",
+       {3},
+       {3},
+       {},
+       {5}});
 
   test_cases.push_back(
-    {"test 2D",
-    {3, 4},
-    {4, 3},
-    {3, 3},
-    {42, 48, 54, 114, 136, 158, 186, 224, 262}});
+      {"test 2D",
+       {3, 4},
+       {4, 3},
+       {3, 3},
+       {42, 48, 54, 114, 136, 158, 186, 224, 262}});
 
   test_cases.push_back(
-    {"test 2D special",
-    {2, 2, 3},
-    {3, 4},
-    {2, 2, 4},
-    {20, 23, 26, 29, 56, 68, 80, 92, 92, 113, 134, 155, 128, 158, 188, 218}});
+      {"test 2D special",
+       {2, 2, 3},
+       {3, 4},
+       {2, 2, 4},
+       {20, 23, 26, 29, 56, 68, 80, 92, 92, 113, 134, 155, 128, 158, 188, 218}});
 
   test_cases.push_back(
-    {"test 2D special 2",
-    {2, 2, 3},
-    {1, 3, 4},
-    {2, 2, 4},
-    {20, 23, 26, 29, 56, 68, 80, 92, 92, 113, 134, 155, 128, 158, 188, 218}});
+      {"test 2D special 2",
+       {2, 2, 3},
+       {1, 3, 4},
+       {2, 2, 4},
+       {20, 23, 26, 29, 56, 68, 80, 92, 92, 113, 134, 155, 128, 158, 188, 218}});
 
   test_cases.push_back(
-    {"test 2D special 3",
-    {2, 6},
-    {1, 1, 6, 1},
-    {1, 1, 2, 1},
-    {55, 145}});
+      {"test 2D special 3",
+       {2, 6},
+       {1, 1, 6, 1},
+       {1, 1, 2, 1},
+       {55, 145}});
+
+  test_cases.push_back(
+      {"test 2D empty input",
+       {3, 4},
+       {4, 0},  // zero-sized last dimension, so the entries below describe an empty result
+       {3, 0},
+       {}});  // no expected values are listed for the empty result
 
   return test_cases;
 }
 
 template <typename T>
-void RunMatMulTest(int32_t opset_version = 7)
-{
+void RunMatMulTest(int32_t opset_version = 7) {
   std::vector<T> common_input_vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
   for (auto t : GenerateTestCases<T>()) {
     OpTester test("MatMul", opset_version);
@@ -105,7 +110,7 @@ void RunMatMulTest(int32_t opset_version = 7)
     test.AddOutput<T>("Y", t.expected_dims, t.expected_vals);
 
     // OpenVINO EP: Disabled temporarily matmul broadcasting not fully supported
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider,kOpenVINOExecutionProvider});// Disable TensorRT because of unsupported data type
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});  // Disable TensorRT because of unsupported data type
   }
 }
 
diff --git a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
index 32095a8a15..d6b17cf5e9 100644
--- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
@@ -46,17 +46,33 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D) {
 }
 
 static void QLinearMatMul2DTest(bool only_t1_not_initializer) {
-  OpTester test("QLinearMatMul", 10);
-  test.AddInput<uint8_t>("T1", {2, 4}, {208, 236, 0, 238, 3, 214, 255, 29});
-  test.AddInput<float>("a_scale", {1}, {0.0066f}, only_t1_not_initializer);
-  test.AddInput<uint8_t>("a_zero_point", {1}, {113}, only_t1_not_initializer);
-  test.AddInput<uint8_t>("T2", {4, 3}, {152, 51, 244, 60, 26, 255, 0, 127, 246, 127, 254, 247}, only_t1_not_initializer);
-  test.AddInput<float>("b_scale", {1}, {0.00705f}, only_t1_not_initializer);
-  test.AddInput<uint8_t>("b_zero_point", {1}, {114}, only_t1_not_initializer);
-  test.AddInput<float>("y_scale", {1}, {0.0107f}, only_t1_not_initializer);
-  test.AddInput<uint8_t>("y_zero_point", {1}, {118}, only_t1_not_initializer);
-  test.AddOutput<uint8_t>("T3", {2, 3}, {168, 115, 255, 1, 66, 151});
-  test.Run();
+  // Baseline case: {2, 4} T1 times {4, 3} T2 with a fully listed {2, 3} expected output
+  OpTester test_non_empty("QLinearMatMul", 10);
+  test_non_empty.AddInput<uint8_t>("T1", {2, 4}, {208, 236, 0, 238, 3, 214, 255, 29});
+  test_non_empty.AddInput<float>("a_scale", {1}, {0.0066f}, only_t1_not_initializer);
+  test_non_empty.AddInput<uint8_t>("a_zero_point", {1}, {113}, only_t1_not_initializer);
+  test_non_empty.AddInput<uint8_t>("T2", {4, 3}, {152, 51, 244, 60, 26, 255, 0, 127, 246, 127, 254, 247}, only_t1_not_initializer);
+  test_non_empty.AddInput<float>("b_scale", {1}, {0.00705f}, only_t1_not_initializer);
+  test_non_empty.AddInput<uint8_t>("b_zero_point", {1}, {114}, only_t1_not_initializer);
+  test_non_empty.AddInput<float>("y_scale", {1}, {0.0107f}, only_t1_not_initializer);
+  test_non_empty.AddInput<uint8_t>("y_zero_point", {1}, {118}, only_t1_not_initializer);
+  test_non_empty.AddOutput<uint8_t>("T3", {2, 3}, {168, 115, 255, 1, 66, 151});
+  test_non_empty.Run();
+
+  // Empty case: same T2, scales and zero points, but T1 has shape {0, 4}, so the expected T3 ({0, 3}) lists no values
+  OpTester test_empty("QLinearMatMul", 10);
+  test_empty.AddInput<uint8_t>("T1", {0, 4}, {});
+  test_empty.AddInput<float>("a_scale", {1}, {0.0066f}, only_t1_not_initializer);
+  test_empty.AddInput<uint8_t>("a_zero_point", {1}, {113}, only_t1_not_initializer);
+  test_empty.AddInput<uint8_t>("T2", {4, 3}, {152, 51, 244, 60, 26, 255, 0, 127, 246, 127, 254, 247}, only_t1_not_initializer);
+  test_empty.AddInput<float>("b_scale", {1}, {0.00705f}, only_t1_not_initializer);
+  test_empty.AddInput<uint8_t>("b_zero_point", {1}, {114}, only_t1_not_initializer);
+  test_empty.AddInput<float>("y_scale", {1}, {0.0107f}, only_t1_not_initializer);
+  test_empty.AddInput<uint8_t>("y_zero_point", {1}, {118}, only_t1_not_initializer);
+  test_empty.AddOutput<uint8_t>("T3", {0, 3}, {});
+
+  // Skip NNAPI as it doesn't support empty output for now
+  test_empty.Run(OpTester::ExpectResult::kExpectSuccess, "", {kNnapiExecutionProvider});
 }
 
 TEST(QuantizeLinearMatmulOpTest, QLinearMatMul) {