From 3c4f3d01cd2a79a3fb7f93edcb6d4b58fe4993ac Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Thu, 14 May 2020 14:52:55 -0700
Subject: [PATCH] Implement QLinearLeakyRelu (#3648)

* Implement QLinearLeakyRelu and its unit test.
* Add logic to compute the lookup table in the constructor when all parameters are constant.
* Fix test case results that depended on the rounding mode.
---
 .../contrib_ops/cpu/cpu_contrib_kernels.cc    |   4 +
 .../contrib_ops/cpu/qlinear_lookup_table.cc   | 121 ++++++++++++++++++
 .../contrib_ops/cpu/qlinear_lookup_table.h    |  26 ++++
 .../core/graph/contrib_ops/contrib_defs.cc    |  31 +++++
 .../contrib_ops/qlinear_lookup_table_test.cc  |  54 ++++++++
 5 files changed, 236 insertions(+)
 create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_lookup_table.cc
 create mode 100644 onnxruntime/contrib_ops/cpu/qlinear_lookup_table.h
 create mode 100644 onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc

diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index 444904aa2c..a81a675fed 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -32,6 +32,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QuantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearLeakyRelu);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu);
@@ -109,6 +111,8 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DequantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearLeakyRelu)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu)>,
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.cc b/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.cc
new file mode 100644
index 0000000000..ee52faae74
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.cc
@@ -0,0 +1,121 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "qlinear_lookup_table.h"
+#include "core/providers/common.h"
+#include "core/mlas/inc/mlas.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+static void QLinearLookupTableTransform(const uint8_t* x, const uint8_t table[256], uint8_t* y, size_t n) {  // y[i] = table[x[i]] for each of the n input bytes
+  for (; n >= 4; n -= 4) {  // main loop: four elements per iteration
+    const size_t x_value0 = x[0];
+    const size_t x_value1 = x[1];
+    const size_t x_value2 = x[2];
+    const size_t x_value3 = x[3];
+    x += 4;
+    const uint8_t table_value0 = table[x_value0];
+    const uint8_t table_value1 = table[x_value1];
+    const uint8_t table_value2 = table[x_value2];
+    const uint8_t table_value3 = table[x_value3];
+    // All four table loads above are issued before any of the stores below.
+    y[0] = table_value0;
+    y[1] = table_value1;
+    y[2] = table_value2;
+    y[3] = table_value3;
+    y += 4;
+  }
+  for (; n != 0; --n) {  // tail: the remaining 0-3 elements, one at a time
+    const size_t x_value0 = *x++;
+    const uint8_t table_value0 = table[x_value0];
+    *y++ = table_value0;
+  }
+}
+
+template <typename T>  // T: quantized element type (instantiated below for int8_t and uint8_t)
+static void BuildQLinearLeakyReluLookupTable(uint8_t table[256],
+                                             const Tensor* tensor_x_scale,
+                                             const Tensor* tensor_x_zero_point,
+                                             const Tensor* tensor_y_scale,
+                                             const Tensor* tensor_y_zero_point,
+                                             float alpha) {
+  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_x_scale),
+              "QLinearLeakyRelu : input X_scale must be a scalar or 1D tensor of size 1");
+  ORT_ENFORCE(tensor_x_zero_point == nullptr || IsScalarOr1ElementVector(tensor_x_zero_point),
+              "QLinearLeakyRelu : input X_zero_point must be a scalar or 1D tensor of size 1");
+  ORT_ENFORCE(IsScalarOr1ElementVector(tensor_y_scale),
+              "QLinearLeakyRelu : input Y_scale must be a scalar or 1D tensor of size 1");
+  ORT_ENFORCE(tensor_y_zero_point == nullptr || IsScalarOr1ElementVector(tensor_y_zero_point),
+              "QLinearLeakyRelu : input Y_zero_point must be a scalar or 1D tensor of size 1");
+  // A null zero-point tensor means the optional input was not supplied; the zero point is then 0.
+  const float X_scale = *(tensor_x_scale->Data<float>());
+  const T X_zero_point = (tensor_x_zero_point == nullptr) ? static_cast<T>(0) : *(tensor_x_zero_point->template Data<T>());
+  const float Y_scale = *(tensor_y_scale->Data<float>());
+  const T Y_zero_point = (tensor_y_zero_point == nullptr) ? static_cast<T>(0) : *(tensor_y_zero_point->template Data<T>());
+  // For each of the 256 possible input bytes: dequantize, apply LeakyRelu in float, then requantize the whole batch.
+  float dequantized_vector[256];
+  for (int i = 0; i < 256; ++i) {
+    T x = static_cast<T>(i);  // table index i converted to the element type T
+    float x_dequantized = X_scale * (static_cast<int>(x) - static_cast<int>(X_zero_point));
+    dequantized_vector[i] = x_dequantized >= 0.0f ? x_dequantized : alpha * x_dequantized;  // negative values are scaled by alpha
+  }
+  MlasQuantizeLinear(dequantized_vector, (T*)table, 256, Y_scale, Y_zero_point);  // fills table[] using the output scale / zero point
+}
+
+// Reads the "alpha" attribute and, when every scale / zero-point input is a constant input (or an
+// omitted optional one), builds the lookup table once here so Compute() can reuse it.
+template <typename T>
+QLinearLeakyRelu<T>::QLinearLeakyRelu(const OpKernelInfo& info)
+    : OpKernel(info), alpha_(info.GetAttrOrDefault("alpha", 0.01f)) {
+  const Tensor* tensor_x_scale = nullptr;
+  const Tensor* tensor_x_zero_point = nullptr;
+  const Tensor* tensor_y_scale = nullptr;
+  const Tensor* tensor_y_zero_point = nullptr;
+  const auto& input_defs = info.node().InputDefs();  // trailing optional inputs may be missing from this list, so check size before indexing
+  bool get_x_scale = info.TryGetConstantInput(1, &tensor_x_scale);
+  bool get_x_zero_point = input_defs.size() <= 2 || !input_defs[2]->Exists() || info.TryGetConstantInput(2, &tensor_x_zero_point);
+  bool get_y_scale = info.TryGetConstantInput(3, &tensor_y_scale);
+  bool get_y_zero_point = input_defs.size() <= 4 || !input_defs[4]->Exists() || info.TryGetConstantInput(4, &tensor_y_zero_point);
+  is_fixed_parameters_ = get_x_scale && get_x_zero_point && get_y_scale && get_y_zero_point;
+  if (is_fixed_parameters_) {
+    BuildQLinearLeakyReluLookupTable<T>(fixed_lookup_table_, tensor_x_scale, tensor_x_zero_point,
+                                        tensor_y_scale, tensor_y_zero_point, alpha_);
+  }
+}
+
+template <typename T>
+Status QLinearLeakyRelu<T>::Compute(OpKernelContext* context) const {
+  const auto& X = *context->Input<Tensor>(0);
+  const auto& input_shape = X.Shape();
+  const auto N = input_shape.Size();
+  auto& Y = *context->Output(0, input_shape);  // output has the same shape as the input
+  // Scratch table; only filled when the parameters were not all constant at construction time.
+  uint8_t table[256];
+  if (!is_fixed_parameters_) {
+    BuildQLinearLeakyReluLookupTable<T>(
+        table, context->Input<Tensor>(1), context->Input<Tensor>(2),
+        context->Input<Tensor>(3), context->Input<Tensor>(4), alpha_);
+  }
+  // Map every input byte through the selected table; the T buffers are viewed as raw bytes.
+  QLinearLookupTableTransform(
+      reinterpret_cast<const uint8_t*>(X.template Data<T>()),
+      is_fixed_parameters_ ? fixed_lookup_table_ : table,
+      reinterpret_cast<uint8_t*>(Y.template MutableData<T>()),
+      static_cast<size_t>(N));
+
+  return Status::OK();
+}
+
+#define REGISTER_QLINEAR_LOOKUPTABLE_TYPED_KERNEL(op_name, version, data_type, KERNEL_CLASS) \
+  ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(                                                         \
+      op_name, version, data_type,                                                           \
+      KernelDefBuilder()                                                                     \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()),                    \
+      KERNEL_CLASS<data_type>);
+// Emit the version-1 typed kernel definitions of QLinearLeakyRelu for int8_t and uint8_t tensors.
+REGISTER_QLINEAR_LOOKUPTABLE_TYPED_KERNEL(QLinearLeakyRelu, 1, int8_t, QLinearLeakyRelu);
+REGISTER_QLINEAR_LOOKUPTABLE_TYPED_KERNEL(QLinearLeakyRelu, 1, uint8_t, QLinearLeakyRelu);
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.h b/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.h
new file mode 100644
index 0000000000..214f56f6f4
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/qlinear_lookup_table.h
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+template <typename T>
+class QLinearLeakyRelu final : public OpKernel {
+ public:
+  QLinearLeakyRelu(const OpKernelInfo& info);
+  // Maps input 0 through a 256-entry lookup table and writes the result to output 0.
+  Status Compute(OpKernelContext* context) const override;
+
+ private:
+  const float alpha_;  // "alpha" attribute; 0.01f when the attribute is not set
+  bool is_fixed_parameters_;   // True when the scales and zero points of both x and y are constant inputs.
+  uint8_t fixed_lookup_table_[256];  // Table built in the constructor; only valid when is_fixed_parameters_ is true.
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 047dec091e..ebe34fbd29 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -2078,6 +2078,37 @@ Output = Dequantize(Input) -> AveragePool on fp32 data -> Quantize(output)
         ONNX_NAMESPACE::convPoolShapeInference(ctx, false, true, 0, 5);
       });
 
+  const char* QLinearLeakyReluDoc_ver1 =  R"DOC(
+QLinearLeakyRelu takes quantized input data (Tensor), an argument alpha, and quantize parameter for output,
+and produces one output data (Tensor<T>) where the function `f(x) = quantize(alpha * dequantize(x)) for dequantize(x) < 0`,
+`f(x) = quantize(dequantize(x)) for dequantize(x) >= 0`, is applied to the data tensor elementwise.
+)DOC";
+  // Schema: inputs X, X_scale, optional X_zero_point, Y_scale, optional Y_zero_point; output Y; T is uint8 or int8.
+  ONNX_CONTRIB_OPERATOR_SCHEMA(QLinearLeakyRelu)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(QLinearLeakyReluDoc_ver1)
+      .Attr("alpha", "Coefficient of leakage.", AttributeProto::FLOAT, 0.01f)
+      .Input(0, "X", "Input tensor", "T")
+      .Input(1, "X_scale",
+             "Input X's scale. It's a scalar, which means a per-tensor/layer quantization.",
+             "tensor(float)")
+      .Input(2, "X_zero_point",
+             "Input X's zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+             "T", OpSchema::Optional)
+      .Input(3, "Y_scale",
+             "Output Y's scale. It's a scalar, which means a per-tensor/layer quantization.",
+             "tensor(float)")
+      .Input(4, "Y_zero_point",
+             "Output Y's zero point. Default value is 0 if it's not specified. It's a scalar, which means a per-tensor/layer quantization.",
+             "T", OpSchema::Optional)
+      .Output(0, "Y", "Output tensor", "T")
+      .TypeConstraint(
+          "T",
+          {"tensor(uint8)", "tensor(int8)"},
+          "Constrain input and output types to 8 bit tensors.")
+      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
+
   ONNX_CONTRIB_OPERATOR_SCHEMA(MurmurHash3)
       .SetDomain(kMSDomain)
       .SinceVersion(1)
diff --git a/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc
new file mode 100644
index 0000000000..2c7a642bcd
--- /dev/null
+++ b/onnxruntime/test/contrib_ops/qlinear_lookup_table_test.cc
@@ -0,0 +1,54 @@
+#include "gtest/gtest.h"
+#include "test/common/tensor_op_test_utils.h"
+#include "test/providers/provider_test_utils.h"
+
+#include <cfenv>
+
+namespace onnxruntime {
+namespace test {
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearLeakyRelu_Int8) {
+  OpTester test("QLinearLeakyRelu", 1, onnxruntime::kMSDomain);
+  test.AddAttribute<float>("alpha", 0.1f);
+  float X_scale = 0.25f;
+  // X_zero_point is deliberately left out (implicitly 0); see AddMissingOptionalInput below.
+  float Y_scale = 0.1f;
+  int8_t Y_zero_point = -100;
+
+  std::vector<int64_t> dims = {16};
+  test.AddInput<int8_t>("X", dims, {0, 16, 17, 18, 19, 90, 91, 127, -128, -110, -108, -100, -16, -17, -18, -1});
+  test.AddInput<float>("X_scale", {}, {X_scale});
+  test.AddMissingOptionalInput<int8_t>(); // optional "X_zero_point" using default value here
+  test.AddInput<float>("Y_scale", {}, {Y_scale});
+  test.AddInput<int8_t>("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput<int8_t>("Y", dims, {-100, -60, -58, -55, -52, 125, 127, 127, -128, -128, -127, -125, -104, -104, -104, -100});
+  auto origin_round_mode = std::fegetround();  // remember the caller's floating-point rounding mode
+  std::fesetround(FE_TONEAREST);  // expected values above assume round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the original rounding mode
+}
+
+
+TEST(QLinearLookupTableBasedOperatorTests, QLinearLeakyRelu_UInt8) {
+  OpTester test("QLinearLeakyRelu", 1, onnxruntime::kMSDomain);
+  test.AddAttribute<float>("alpha", 0.1f);
+  float X_scale = 0.25f;
+  uint8_t X_zero_point = 128;  // explicit zero point, unlike the Int8 test which omits it
+  float Y_scale = 0.1f;
+  uint8_t Y_zero_point = 30;
+
+  std::vector<int64_t> dims = {16};
+  test.AddInput<uint8_t>("X", dims, {0, 16, 17, 18, 19, 90, 91, 127, 128, 136, 137, 138, 216, 217, 218, 255});
+  test.AddInput<float>("X_scale", {}, {X_scale});
+  test.AddInput<uint8_t>("X_zero_point", {}, {X_zero_point});
+  test.AddInput<float>("Y_scale", {}, {Y_scale});
+  test.AddInput<uint8_t>("Y_zero_point", {}, {Y_zero_point});
+  test.AddOutput<uint8_t>("Y", dims, {0, 2, 2, 2, 3, 20, 21, 30, 30, 50, 52, 55, 250, 252, 255, 255});
+  auto origin_round_mode = std::fegetround();  // remember the caller's floating-point rounding mode
+  std::fesetround(FE_TONEAREST);  // expected values above assume round-to-nearest
+  test.Run();
+  std::fesetround(origin_round_mode);  // restore the original rounding mode
+}
+
+}
+}